Blame sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S

Packit 6c4009
/* Function sincosf vectorized with AVX-512. KNL and SKX versions.
Packit 6c4009
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
Packit 6c4009
   This file is part of the GNU C Library.
Packit 6c4009
Packit 6c4009
   The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
   modify it under the terms of the GNU Lesser General Public
Packit 6c4009
   License as published by the Free Software Foundation; either
Packit 6c4009
   version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
Packit 6c4009
   The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
   Lesser General Public License for more details.
Packit 6c4009
Packit 6c4009
   You should have received a copy of the GNU Lesser General Public
Packit 6c4009
   License along with the GNU C Library; if not, see
Packit 6c4009
   <http://www.gnu.org/licenses/>.  */
Packit 6c4009
Packit 6c4009
#include <sysdep.h>
Packit 6c4009
#include "svml_s_trig_data.h"
Packit 6c4009
#include "svml_s_wrapper_impl.h"
Packit 6c4009
Packit 6c4009
/*
Packit 6c4009
   ALGORITHM DESCRIPTION:
Packit 6c4009
Packit 6c4009
     1) Range reduction to [-Pi/4; +Pi/4] interval
Packit 6c4009
        a) Grab sign from source argument and save it.
Packit 6c4009
        b) Remove sign using AND operation
Packit 6c4009
        c) Getting octant Y by 2/Pi multiplication
Packit 6c4009
        d) Add "Right Shifter" value
Packit 6c4009
        e) Treat obtained value as integer S for destination sign setting.
Packit 6c4009
           SS = ((S-S&1)&2)<<30; For sin part
Packit 6c4009
           SC = ((S+S&1)&2)<<30; For cos part
Packit 6c4009
        f) Change destination sign if source sign is negative
Packit 6c4009
           using XOR operation.
Packit 6c4009
        g) Subtract "Right Shifter" (0x4B000000) value
Packit 6c4009
        h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts
           (this FMA implementation uses __sPI1_FMA, __sPI2_FMA, __sPI3_FMA):
           X = X - Y*PI1 - Y*PI2 - Y*PI3;
Packit 6c4009
     2) Polynomial (minimax for sin within  [-Pi/4; +Pi/4] interval)
Packit 6c4009
        a) Calculate X^2 = X * X
Packit 6c4009
        b) Calculate 2 polynomials for sin and cos:
Packit 6c4009
           RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
Packit 6c4009
           RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
Packit 6c4009
        c) Swap RS & RC if first bit of obtained value after
Packit 6c4009
           Right Shifting is set to 1. Using And, Andnot & Or operations.
Packit 6c4009
     3) Destination sign setting
Packit 6c4009
        a) Set shifted destination sign using XOR operation:
Packit 6c4009
           R1 = XOR( RS, SS );
Packit 6c4009
           R2 = XOR( RC, SC ).  */
Packit 6c4009
Packit 6c4009
	.text
Packit 6c4009
/* _ZGVeN16vl4l4_sincosf_knl -- 16-lane single-precision sincos.

   In:    %zmm0 = 16 packed float arguments
          %rdi  = where the 16 sine results are stored
          %rsi  = where the 16 cosine results are stored
   Out:   16 floats written to (%rdi) and 16 floats written to (%rsi).

   Without HAVE_AVX512DQ_ASM_SUPPORT the body is only the
   WRAPPER_IMPL_AVX512_fFF macro (defined in svml_s_wrapper_impl.h)
   applied to the AVX2 vl4l4 variant.

   Stack frame after the prologue (%rsp is 64-byte aligned, 1344 bytes):
        0 ..  960   spill slots for %zmm31 (at 0) .. %zmm16 (at 960)
     1024 .. 1048   spill slots for %k7, %k6, %k5, %k4
     1056           saved %rsi (cosine destination)
     1064 .. 1096   saved %rbx, %r15, %r14, %r13, %r12
     1152           copy of the 16 input values
     1216           16 sine results
     1280           16 cosine results
   The spill slots are only written on the slow path (.LBL_1_3).  */
ENTRY (_ZGVeN16vl4l4_sincosf_knl)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
#else
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1344, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
        vmovaps   %zmm0, %zmm2
        movl      $-1, %edx
        vmovups __sAbsMask(%rax), %zmm0
        vmovups __sInvPI(%rax), %zmm3

/* Absolute argument computation: zmm1 = |x|, zmm0 = sign bits of x */
        vpandd    %zmm0, %zmm2, %zmm1
        vmovups __sPI1_FMA(%rax), %zmm5
        vmovups __sSignMask(%rax), %zmm9
        vpandnd   %zmm2, %zmm0, %zmm0

/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
      X = X - Y*PI1 - Y*PI2 - Y*PI3
   zmm6 accumulates the reduction for the sin part, zmm8 for the cos part */
        vmovaps   %zmm1, %zmm6
        vmovaps   %zmm1, %zmm8

/* c) Getting octant Y by 2/Pi multiplication
   d) Add "Right Shifter" value */
        vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3
        vmovups __sPI3_FMA(%rax), %zmm7

/* g) Subtract "Right Shifter" (0x4B000000) value */
        vsubps __sRShifter(%rax), %zmm3, %zmm12

/* e) Treat obtained value as integer S for destination sign setting */
        vpslld    $31, %zmm3, %zmm13
        vmovups __sA7_FMA(%rax), %zmm14
        vfnmadd231ps %zmm12, %zmm5, %zmm6

/* 2) Polynomial (minimax for sin within  [-Pi/4; +Pi/4] interval)
      a) Calculate X^2 = X * X
      b) Calculate 2 polynomials for sin and cos:
         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
        vmovaps   %zmm14, %zmm15
        vmovups __sA9_FMA(%rax), %zmm3

/* Special-case mask: predicate 22 selects lanes where |x| is NOT
   <= __sRangeReductionVal (unordered compares true).  Those lanes get
   all-ones in zmm1 (zero elsewhere); %ecx ends up with one bit per
   such lane.  */
        vcmpps    $22, __sRangeReductionVal(%rax), %zmm1, %k1
        vpbroadcastd %edx, %zmm1{%k1}{z}
        vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6
        vptestmd  %zmm1, %zmm1, %k0
        vpandd    %zmm6, %zmm9, %zmm11
        kmovw     %k0, %ecx
        vpxord __sOneHalf(%rax), %zmm11, %zmm4

/* Result sign calculations (imm 150 = three-way XOR) */
        vpternlogd $150, %zmm13, %zmm9, %zmm11

/* Add correction term 0.5 for cos() part */
        vaddps    %zmm4, %zmm12, %zmm10
        vfnmadd213ps %zmm6, %zmm7, %zmm12
        vfnmadd231ps %zmm10, %zmm5, %zmm8
        vpxord    %zmm13, %zmm12, %zmm13
        vmulps    %zmm13, %zmm13, %zmm12
        vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8
        vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15
        vfnmadd213ps %zmm8, %zmm7, %zmm10
        vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15
        vpxord    %zmm11, %zmm10, %zmm5
        vmulps    %zmm5, %zmm5, %zmm4
        vfmadd213ps __sA3(%rax), %zmm12, %zmm15
        vfmadd213ps %zmm14, %zmm4, %zmm3
        vmulps    %zmm12, %zmm15, %zmm14
        vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3
        vfmadd213ps %zmm13, %zmm13, %zmm14
        vfmadd213ps __sA3(%rax), %zmm4, %zmm3

/* zmm0 = sine result (input sign re-applied), zmm3 = cosine result */
        vpxord    %zmm0, %zmm14, %zmm0
        vmulps    %zmm4, %zmm3, %zmm3
        vfmadd213ps %zmm5, %zmm5, %zmm3

/* Any lane flagged for scalar handling?  */
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: store both result vectors and tear the frame down.  */
.LBL_1_2:
        cfi_remember_state
        vmovups   %zmm0, (%rdi)
        vmovups   %zmm3, (%rsi)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Slow path: at least one lane must be recomputed with scalar
   sinf/cosf.  Spill the inputs and the vector results so individual
   lanes can be patched in memory.  This label is reached only through
   the jne above, so no further flag test is needed here.  */
.LBL_1_3:
        cfi_restore_state
        vmovups   %zmm2, 1152(%rsp)
        vmovups   %zmm0, 1216(%rsp)
        vmovups   %zmm3, 1280(%rsp)

/* Save the state that must survive the scalar calls and set up the
   loop registers:
     %r12b = pair index, %r13d = bit index (2 * pair index),
     %r14d = special-lane bit mask, %rbx = sine destination.  */
        xorb      %dl, %dl
        kmovw     %k4, 1048(%rsp)
        xorl      %eax, %eax
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %eax, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %ecx, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq      %rbx, 1064(%rsp)
        movq      %rdi, %rbx
        cfi_remember_state

/* Walk the mask two lanes at a time: even lane first, then odd lane.  */
.LBL_1_6:
        btl       %r13d, %r14d
        jc        .LBL_1_13

.LBL_1_7:
        lea       1(%r13), %esi
        btl       %esi, %r14d
        jc        .LBL_1_10

.LBL_1_8:
        addb      $1, %r12b
        addl      $2, %r13d
/* 16 lanes are 8 pairs.  kmovw zero-extends the 16-bit mask, so bits
   16..31 of %r14d are never set and iterating further would only test
   zero bits.  */
        cmpb      $8, %r12b
        jb        .LBL_1_6

/* All flagged lanes patched: restore saved state, reload the patched
   result vectors and leave through the common exit.  */
        movq      %rbx, %rdi
        kmovw     1048(%rsp), %k4
        movq      1056(%rsp), %rsi
        kmovw     1040(%rsp), %k5
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        kmovw     1032(%rsp), %k6
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        movq      1064(%rsp), %rbx
        vmovups   1216(%rsp), %zmm0
        vmovups   1280(%rsp), %zmm3
        jmp       .LBL_1_2

/* Odd lane of the current pair: element offset 4 within the 8-byte
   pair slot.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1220(%rsp,%r15,8)
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 1284(%rsp,%r15,8)
        jmp       .LBL_1_8

/* Even lane of the current pair: element offset 0 within the 8-byte
   pair slot.  */
.LBL_1_13:
        movzbl    %r12b, %r15d
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1216(%rsp,%r15,8)
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 1280(%rsp,%r15,8)
        jmp       .LBL_1_7
#endif
END (_ZGVeN16vl4l4_sincosf_knl)
libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl)
Packit 6c4009
Packit 6c4009
/* _ZGVeN16vl4l4_sincosf_skx -- 16-lane single-precision sincos.

   In:    %zmm0 = 16 packed float arguments
          %rdi  = where the 16 sine results are stored
          %rsi  = where the 16 cosine results are stored
   Out:   16 floats written to (%rdi) and 16 floats written to (%rsi).

   Without HAVE_AVX512DQ_ASM_SUPPORT the body is only the
   WRAPPER_IMPL_AVX512_fFF macro (defined in svml_s_wrapper_impl.h).
   The callee handed to it must be the vl4l4 (two plain result pointers)
   AVX2 variant, matching this function's own signature and the _knl
   entry point; the vvv variant takes vectors of per-lane pointers
   instead (see WRAPPER_AVX512_vvv_vl4l4 in this file).

   Stack frame after the prologue (%rsp is 64-byte aligned, 1344 bytes):
        0 ..  960   spill slots for %zmm31 (at 0) .. %zmm16 (at 960)
     1024 .. 1048   spill slots for %k7, %k6, %k5, %k4
     1056           saved %rsi (cosine destination)
     1064 .. 1096   saved %rbx, %r15, %r14, %r13, %r12
     1152           copy of the 16 input values
     1216           16 sine results
     1280           16 cosine results
   The spill slots are only written on the slow path (.LBL_2_3).  */
ENTRY (_ZGVeN16vl4l4_sincosf_skx)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
#else
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1344, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
        vmovaps   %zmm0, %zmm4
        vmovups __sAbsMask(%rax), %zmm3
        vmovups __sInvPI(%rax), %zmm5
        vmovups __sRShifter(%rax), %zmm6
        vmovups __sPI1_FMA(%rax), %zmm9
        vmovups __sPI2_FMA(%rax), %zmm10
        vmovups __sSignMask(%rax), %zmm14
        vmovups __sOneHalf(%rax), %zmm7
        vmovups __sPI3_FMA(%rax), %zmm12

/* Absolute argument computation: zmm2 = |x| */
        vandps    %zmm3, %zmm4, %zmm2

/* c) Getting octant Y by 2/Pi multiplication
   d) Add "Right Shifter" value */
        vfmadd213ps %zmm6, %zmm2, %zmm5

/* Predicate 18 sets %k1 for lanes where |x| <= __sRangeReductionVal
   (ordered compare, so NaN lanes are left clear); used below to build
   the special-case mask.  */
        vcmpps    $18, __sRangeReductionVal(%rax), %zmm2, %k1

/* e) Treat obtained value as integer S for destination sign setting */
        vpslld    $31, %zmm5, %zmm0

/* g) Subtract "Right Shifter" (0x4B000000) value */
        vsubps    %zmm6, %zmm5, %zmm5
        vmovups __sA3(%rax), %zmm6

/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
      X = X - Y*PI1 - Y*PI2 - Y*PI3 */
        vmovaps   %zmm2, %zmm11
        vfnmadd231ps %zmm5, %zmm9, %zmm11
        vfnmadd231ps %zmm5, %zmm10, %zmm11
        vandps    %zmm11, %zmm14, %zmm1
        vxorps    %zmm1, %zmm7, %zmm8

/* Result sign calculations (imm 150 = three-way XOR) */
        vpternlogd $150, %zmm0, %zmm14, %zmm1

/* zmm14 = all ones; in-range lanes are cleared from it further down */
        vmovups   .L_2il0floatpacket.13(%rip), %zmm14

/* Add correction term 0.5 for cos() part */
        vaddps    %zmm8, %zmm5, %zmm15
        vfnmadd213ps %zmm11, %zmm12, %zmm5

/* zmm11 = sign bits of the original argument */
        vandnps   %zmm4, %zmm3, %zmm11
        vmovups __sA7_FMA(%rax), %zmm3
        vmovaps   %zmm2, %zmm13
        vfnmadd231ps %zmm15, %zmm9, %zmm13
        vxorps    %zmm0, %zmm5, %zmm9
        vmovups __sA5_FMA(%rax), %zmm0
        vfnmadd231ps %zmm15, %zmm10, %zmm13
        vmulps    %zmm9, %zmm9, %zmm8
        vfnmadd213ps %zmm13, %zmm12, %zmm15
        vmovups __sA9_FMA(%rax), %zmm12
        vxorps    %zmm1, %zmm15, %zmm1
        vmulps    %zmm1, %zmm1, %zmm13

/* 2) Polynomial (minimax for sin within  [-Pi/4; +Pi/4] interval)
      a) Calculate X^2 = X * X
      b) Calculate 2 polynomials for sin and cos:
         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))))
   The sin chain runs in zmm7 (square in zmm8), the cos chain in zmm12
   (square in zmm13); the two chains are interleaved.  */
        vmovaps   %zmm12, %zmm7
        vfmadd213ps %zmm3, %zmm8, %zmm7
        vfmadd213ps %zmm3, %zmm13, %zmm12
        vfmadd213ps %zmm0, %zmm8, %zmm7
        vfmadd213ps %zmm0, %zmm13, %zmm12
        vfmadd213ps %zmm6, %zmm8, %zmm7
        vfmadd213ps %zmm6, %zmm13, %zmm12
        vmulps    %zmm8, %zmm7, %zmm10
        vmulps    %zmm13, %zmm12, %zmm3
        vfmadd213ps %zmm9, %zmm9, %zmm10
        vfmadd213ps %zmm1, %zmm1, %zmm3

/* zmm0 = sine result (input sign re-applied), zmm3 = cosine result */
        vxorps    %zmm11, %zmm10, %zmm0

/* x ANDN x = 0: clear the in-range lanes (%k1) of the all-ones vector,
   leaving all-ones only in lanes that need scalar handling; %ecx gets
   one bit per such lane.  */
        vpandnd   %zmm2, %zmm2, %zmm14{%k1}
        vptestmd  %zmm14, %zmm14, %k0
        kmovw     %k0, %ecx
        testl     %ecx, %ecx
        jne       .LBL_2_3

/* Common exit: store both result vectors and tear the frame down.  */
.LBL_2_2:
        cfi_remember_state
        vmovups   %zmm0, (%rdi)
        vmovups   %zmm3, (%rsi)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Slow path: at least one lane must be recomputed with scalar
   sinf/cosf.  Spill the inputs and the vector results so individual
   lanes can be patched in memory.  This label is reached only through
   the jne above, so no further flag test is needed here.  */
.LBL_2_3:
        cfi_restore_state
        vmovups   %zmm4, 1152(%rsp)
        vmovups   %zmm0, 1216(%rsp)
        vmovups   %zmm3, 1280(%rsp)

/* Save the state that must survive the scalar calls and set up the
   loop registers:
     %r12b = pair index, %r13d = bit index (2 * pair index),
     %r14d = special-lane bit mask, %rbx = sine destination.  */
        xorb      %dl, %dl
        xorl      %eax, %eax
        kmovw     %k4, 1048(%rsp)
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %eax, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %ecx, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq      %rbx, 1064(%rsp)
        movq      %rdi, %rbx
        cfi_remember_state

/* Walk the mask two lanes at a time: even lane first, then odd lane.  */
.LBL_2_6:
        btl       %r13d, %r14d
        jc        .LBL_2_13

.LBL_2_7:
        lea       1(%r13), %esi
        btl       %esi, %r14d
        jc        .LBL_2_10

.LBL_2_8:
        incb      %r12b
        addl      $2, %r13d
/* 16 lanes are 8 pairs.  kmovw zero-extends the 16-bit mask, so bits
   16..31 of %r14d are never set and iterating further would only test
   zero bits.  */
        cmpb      $8, %r12b
        jb        .LBL_2_6

/* All flagged lanes patched: restore saved state, reload the patched
   result vectors and leave through the common exit.  */
        kmovw     1048(%rsp), %k4
        movq      %rbx, %rdi
        kmovw     1040(%rsp), %k5
        kmovw     1032(%rsp), %k6
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        vmovups   1216(%rsp), %zmm0
        vmovups   1280(%rsp), %zmm3
        movq      1056(%rsp), %rsi
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        movq      1064(%rsp), %rbx
        jmp       .LBL_2_2

/* Odd lane of the current pair: element offset 4 within the 8-byte
   pair slot.  The argument is loaded once, after vzeroupper.  */
.LBL_2_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vzeroupper
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1220(%rsp,%r15,8)
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 1284(%rsp,%r15,8)
        jmp       .LBL_2_8

/* Even lane of the current pair: element offset 0 within the 8-byte
   pair slot.  The argument is loaded once, after vzeroupper.  */
.LBL_2_13:
        movzbl    %r12b, %r15d
        vzeroupper
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1216(%rsp,%r15,8)
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 1280(%rsp,%r15,8)
        jmp       .LBL_2_7
#endif
END (_ZGVeN16vl4l4_sincosf_skx)
libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
Packit 6c4009
Packit 6c4009
/* Wrapper between vvv and vl4l4 vector variants.

   Expands to a complete function body that adapts the "vvv" calling
   form (per-lane result pointers passed in vector registers) to the
   "vl4l4" form implemented by \callee (two plain result pointers in
   %rdi / %rsi).  %zmm0 is passed through to \callee untouched.

   LP64 (#ifndef __ILP32__):
     In:  %zmm1:%zmm2 = 16 x 64-bit sine destination pointers
          %zmm3:%zmm4 = 16 x 64-bit cosine destination pointers
     Frame (384 bytes, 64-byte aligned):
          0(%rsp)   16 sine results written by \callee
         64(%rsp)   16 cosine results written by \callee
        128(%rsp)   16 sine pointers   (%zmm1, %zmm2)
        256(%rsp)   16 cosine pointers (%zmm3, %zmm4)

   ILP32:
     In:  %zmm1 = 16 x 32-bit sine destination pointers
          %zmm2 = 16 x 32-bit cosine destination pointers
     Frame (offsets from %ebp):
        -176   16 sine results      -112   16 cosine results
        -240   16 sine pointers     -304   16 cosine pointers

   After the call every result is copied through its own pointer.  */
.macro WRAPPER_AVX512_vvv_vl4l4 callee
#ifndef __ILP32__
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $384, %rsp

/* Park the pointer vectors and point %rdi / %rsi at the result slots.  */
        vmovups   %zmm1, 128(%rsp)
        lea       (%rsp), %rdi
        vmovups   %zmm2, 192(%rdi)
        vmovups   %zmm3, 256(%rdi)
        vmovups   %zmm4, 320(%rdi)
        lea       64(%rsp), %rsi
        call      HIDDEN_JUMPTARGET(\callee)

/* Scatter: sine lanes 0..3.  Pointer and value loads for the next
   group are interleaved with the stores of the current one.  */
        movq      128(%rsp), %rdx
        movq      136(%rsp), %rsi
        movq      144(%rsp), %r8
        movq      152(%rsp), %r10
        movl      (%rsp), %eax
        movl      4(%rsp), %ecx
        movl      8(%rsp), %edi
        movl      12(%rsp), %r9d
        movl      %eax, (%rdx)
        movl      %ecx, (%rsi)
        movq      160(%rsp), %rax
        movq      168(%rsp), %rcx
        movl      %edi, (%r8)
        movl      %r9d, (%r10)

/* Sine lanes 4..7.  */
        movq      176(%rsp), %rdi
        movq      184(%rsp), %r9
        movl      16(%rsp), %r11d
        movl      20(%rsp), %edx
        movl      24(%rsp), %esi
        movl      28(%rsp), %r8d
        movl      %r11d, (%rax)
        movl      %edx, (%rcx)
        movq      192(%rsp), %r11
        movq      200(%rsp), %rdx
        movl      %esi, (%rdi)
        movl      %r8d, (%r9)

/* Sine lanes 8..11.  */
        movq      208(%rsp), %rsi
        movq      216(%rsp), %r8
        movl      32(%rsp), %r10d
        movl      36(%rsp), %eax
        movl      40(%rsp), %ecx
        movl      44(%rsp), %edi
        movl      %r10d, (%r11)
        movl      %eax, (%rdx)
        movq      224(%rsp), %r10
        movq      232(%rsp), %rax
        movl      %ecx, (%rsi)
        movl      %edi, (%r8)

/* Sine lanes 12..15.  */
        movq      240(%rsp), %rcx
        movq      248(%rsp), %rdi
        movl      48(%rsp), %r9d
        movl      52(%rsp), %r11d
        movl      56(%rsp), %edx
        movl      60(%rsp), %esi
        movl      %r9d, (%r10)
        movl      %r11d, (%rax)
        movq      256(%rsp), %r9
        movq      264(%rsp), %r11
        movl      %edx, (%rcx)
        movl      %esi, (%rdi)

/* Cosine lanes 0..3.  */
        movq      272(%rsp), %rdx
        movq      280(%rsp), %rsi
        movl      64(%rsp), %r8d
        movl      68(%rsp), %r10d
        movl      72(%rsp), %eax
        movl      76(%rsp), %ecx
        movl      %r8d, (%r9)
        movl      %r10d, (%r11)
        movq      288(%rsp), %r8
        movq      296(%rsp), %r10
        movl      %eax, (%rdx)
        movl      %ecx, (%rsi)

/* Cosine lanes 4..7.  */
        movq      304(%rsp), %rax
        movq      312(%rsp), %rcx
        movl      80(%rsp), %edi
        movl      84(%rsp), %r9d
        movl      88(%rsp), %r11d
        movl      92(%rsp), %edx
        movl      %edi, (%r8)
        movl      %r9d, (%r10)
        movq      320(%rsp), %rdi
        movq      328(%rsp), %r9
        movl      %r11d, (%rax)
        movl      %edx, (%rcx)

/* Cosine lanes 8..11.  */
        movq      336(%rsp), %r11
        movq      344(%rsp), %rdx
        movl      96(%rsp), %esi
        movl      100(%rsp), %r8d
        movl      104(%rsp), %r10d
        movl      108(%rsp), %eax
        movl      %esi, (%rdi)
        movl      %r8d, (%r9)
        movq      352(%rsp), %rsi
        movq      360(%rsp), %r8
        movl      %r10d, (%r11)
        movl      %eax, (%rdx)

/* Cosine lanes 12..15.  */
        movq      368(%rsp), %r10
        movq      376(%rsp), %rax
        movl      112(%rsp), %ecx
        movl      116(%rsp), %edi
        movl      120(%rsp), %r9d
        movl      124(%rsp), %r11d
        movl      %ecx, (%rsi)
        movl      %edi, (%r8)
        movl      %r9d, (%r10)
        movl      %r11d, (%rax)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
#else
/* ILP32: realign the stack to 64 bytes, keeping the incoming stack
   address in %r10 so it can be restored on exit.  */
        leal    8(%rsp), %r10d
        .cfi_def_cfa 10, 0
        andl    $-64, %esp
        pushq   -8(%r10d)
        pushq   %rbp
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movl    %esp, %ebp
        pushq   %r10
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        leal    -112(%rbp), %esi
        leal    -176(%rbp), %edi
        subl    $296, %esp
        vmovdqa64 %zmm1, -240(%ebp)
        vmovdqa64 %zmm2, -304(%ebp)
        call    HIDDEN_JUMPTARGET(\callee)

/* Scatter the 16 sine results (pointer at -240+4*i, value at -176+4*i).  */
        movl    -240(%ebp), %eax
        vmovss  -176(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -236(%ebp), %eax
        vmovss  -172(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -232(%ebp), %eax
        vmovss  -168(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -228(%ebp), %eax
        vmovss  -164(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -224(%ebp), %eax
        vmovss  -160(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -220(%ebp), %eax
        vmovss  -156(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -216(%ebp), %eax
        vmovss  -152(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -212(%ebp), %eax
        vmovss  -148(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -208(%ebp), %eax
        vmovss  -144(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -204(%ebp), %eax
        vmovss  -140(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -200(%ebp), %eax
        vmovss  -136(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -196(%ebp), %eax
        vmovss  -132(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -192(%ebp), %eax
        vmovss  -128(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -188(%ebp), %eax
        vmovss  -124(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -184(%ebp), %eax
        vmovss  -120(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -180(%ebp), %eax
        vmovss  -116(%ebp), %xmm0
        vmovss  %xmm0, (%eax)

/* Scatter the 16 cosine results (pointer at -304+4*i, value at -112+4*i).  */
        movl    -304(%ebp), %eax
        vmovss  -112(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -300(%ebp), %eax
        vmovss  -108(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -296(%ebp), %eax
        vmovss  -104(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -292(%ebp), %eax
        vmovss  -100(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -288(%ebp), %eax
        vmovss  -96(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -284(%ebp), %eax
        vmovss  -92(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -280(%ebp), %eax
        vmovss  -88(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -276(%ebp), %eax
        vmovss  -84(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -272(%ebp), %eax
        vmovss  -80(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -268(%ebp), %eax
        vmovss  -76(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -264(%ebp), %eax
        vmovss  -72(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -260(%ebp), %eax
        vmovss  -68(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -256(%ebp), %eax
        vmovss  -64(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -252(%ebp), %eax
        vmovss  -60(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -248(%ebp), %eax
        vmovss  -56(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        movl    -244(%ebp), %eax
        vmovss  -52(%ebp), %xmm0
        vmovss  %xmm0, (%eax)
        addl    $296, %esp
        popq    %r10
        .cfi_def_cfa 10, 0
        popq    %rbp
        leal    -8(%r10), %esp
        .cfi_def_cfa 7, 8
        ret
#endif
.endm
Packit 6c4009
Packit 6c4009
/* vvv entry point (per-lane result pointers in vector registers);
   forwards to the vl4l4 _knl implementation via the wrapper macro.  */
ENTRY (_ZGVeN16vvv_sincosf_knl)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl
END (_ZGVeN16vvv_sincosf_knl)
Packit 6c4009
Packit 6c4009
/* vvv entry point (per-lane result pointers in vector registers);
   forwards to the vl4l4 _skx implementation via the wrapper macro.  */
ENTRY (_ZGVeN16vvv_sincosf_skx)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
END (_ZGVeN16vvv_sincosf_skx)
Packit 6c4009
Packit 6c4009
	.section .rodata, "a"
/* 16 x 0xffffffff (one full zmm register of set bits).  Loaded by
   _ZGVeN16vl4l4_sincosf_skx as the starting value of its special-lane
   mask; the load is unaligned (vmovups).  */
.L_2il0floatpacket.13:
	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
	.type	.L_2il0floatpacket.13,@object