|
Packit |
c32a2d |
/*
	dct64_avx: AVX optimized dct64 for x86-64

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/
|
|
Packit |
c32a2d |
|
|
Packit |
c32a2d |
#include "mangle.h"
|
|
Packit |
c32a2d |
|
|
Packit |
c32a2d |
/*
	Symbolic names for the general-purpose registers used by dct64_avx.
	out0, out1 and samples are the three arguments in prototype order; the
	IS_MSABI prologue of dct64_avx moves the incoming %rcx/%rdx/%r8 values
	into these same registers, so the body can use one set of names for
	both calling conventions.
*/
#define samples %rdx	/* 3rd argument: input values, read only at function start */
#define costab %rcx	/* loaded with the address of costab_avx inside the function */
#define out0 %rdi	/* 1st argument: first 16-bit output buffer */
#define out1 %rsi	/* 2nd argument: second 16-bit output buffer */
|
|
Packit |
c32a2d |
|
|
Packit |
c32a2d |
/*
	void dct64_avx(short *out0, short *out1, real *samples);
*/
|
|
Packit |
c32a2d |
|
|
Packit |
c32a2d |
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN32
/*
	costab_avx: 32 constant 32-bit words (128 bytes) used as multipliers by
	dct64_avx.  The words are written as integers but are consumed as
	single-precision floats (vmulps / vmovaps); e.g. the first word,
	0x3F002785, is roughly 0.5006.

	Layout, as referenced by the comments in dct64_avx:
	  byte   0.. 63  cos64[0..15]
	  byte  64.. 95  cos32[0..7]
	  byte  96..111  cos16[0..3]
	  byte 112..119  cos8[0..1]
	  byte 120..123  cos4[0]
	  byte 124..127  zero padding (loaded, but its value is never selected)

	dct64_avx reads offsets 64 and 96 with vmovaps on ymm registers, which
	needs 32-byte alignment; ALIGN32 (from mangle.h) is expected to provide
	that.
*/
costab_avx:
	/* cos64[0..15] */
	.long 1056974725
	.long 1057056395
	.long 1057223771
	.long 1057485416
	.long 1057855544
	.long 1058356026
	.long 1059019886
	.long 1059897405
	.long 1061067246
	.long 1062657950
	.long 1064892987
	.long 1066774581
	.long 1069414683
	.long 1073984175
	.long 1079645762
	.long 1092815430
	/* cos32[0..7] */
	.long 1057005197
	.long 1057342072
	.long 1058087743
	.long 1059427869
	.long 1061799040
	.long 1065862217
	.long 1071413542
	.long 1084439708
	/* cos16[0..3] */
	.long 1057128951
	.long 1058664893
	.long 1063675095
	.long 1076102863
	/* cos8[0..1] */
	.long 1057655764
	.long 1067924853
	/* cos4[0] */
	.long 1060439283
	/* padding to a full 32-byte row */
	.long 0
|
|
Packit |
c32a2d |
/*
	void dct64_avx(short *out0, short *out1, real *samples);

	In (after the optional IS_MSABI prologue):
	  out0    (%rdi)  first output buffer
	  out1    (%rsi)  second output buffer
	  samples (%rdx)  32 single-precision inputs (128 bytes); the loads used
	                  here do not require any particular alignment

	Out:
	  33 results are converted to integers, saturated to signed 16 bit and
	  stored as single 16-bit words, one every 32 bytes:
	    out0 + 0, 32, 64, ... , 512   (17 words; highest write ends at byte 514)
	    out1 + 0, 32, 64, ... , 480   (16 words)
	  out0 + 0 and out1 + 0 receive the same value.
	  No return value is produced (%rax is left holding a stride constant).

	Clobbers:
	  %rax, %rcx, %rdx, %r8, %r9, flags, ymm0-ymm5 always.
	  Without IS_MSABI additionally %rdi, %rsi and ymm6-ymm12.
	  With IS_MSABI, %rdi, %rsi and xmm6-xmm12 are saved on the stack and
	  restored before returning.

	Structure: five sum/difference stages, each multiplying the differences
	by a row of costab_avx (cos64, cos32, cos16, cos8, cos4), followed by a
	chain of additions that forms the final values, then conversion and
	scattered 16-bit stores.  The "bufs[...]" index lists in the trailing
	comments are the original annotations for which logical elements a
	register holds after each instruction.
*/
	.text
	ALIGN16
.globl ASM_NAME(dct64_avx)
ASM_NAME(dct64_avx):
#ifdef IS_MSABI
	/*
		Arguments arrive in %rcx, %rdx, %r8 here.  Build a frame, spill
		xmm6-xmm12 and %rdi/%rsi (all are overwritten below), then move the
		arguments into the registers the body expects.
		After "push %rbp" the stack pointer is 16-byte aligned, and 112 is a
		multiple of 16, so the movaps spills are aligned.
	*/
	push		%rbp
	mov			%rsp, %rbp
	sub			$112, %rsp
	movaps		%xmm6, (%rsp)
	movaps		%xmm7, 16(%rsp)
	movaps		%xmm8, 32(%rsp)
	movaps		%xmm9, 48(%rsp)
	movaps		%xmm10, 64(%rsp)
	movaps		%xmm11, 80(%rsp)
	movaps		%xmm12, 96(%rsp)
	push		%rdi
	push		%rsi
	mov			%rcx, %rdi
	mov			%rdx, %rsi
	mov			%r8, %rdx
#endif
	leaq		costab_avx(%rip), costab

	/*
		Stage 1: load 32 inputs, reverse the upper 16, form sums and
		differences, scale the differences by cos64[0..15].
		vperm2f128 $0x23 takes both result lanes from the memory operand
		(swapped); the previous contents of ymm2/ymm3 are not selected.
		The following vshufps $0x1b reverses each lane, completing a full
		8-element reversal.
	*/
	vmovups		(samples), %ymm0				# input[0,1,2,3,4,5,6,7]
	vmovups		32(samples), %ymm1				# input[8,9,10,11,12,13,14,15]
	vperm2f128	$0x23, 64(samples), %ymm2, %ymm2
	vperm2f128	$0x23, 96(samples), %ymm3, %ymm3
	vshufps		$0x1b, %ymm2, %ymm2, %ymm2		# input[23,22,21,20,19,18,17,16]
	vshufps		$0x1b, %ymm3, %ymm3, %ymm3		# input[31,30,29,28,27,26,25,24]
	vsubps		%ymm2, %ymm1, %ymm6
	vsubps		%ymm3, %ymm0, %ymm7
	vaddps		%ymm0, %ymm3, %ymm4				# bufs[0,1,2,3,4,5,6,7]
	vaddps		%ymm1, %ymm2, %ymm5				# bufs[8,9,10,11,12,13,14,15]
	vmulps		(costab), %ymm7, %ymm7			# bufs[31,30,29,28,27,26,25,24] cos64[0,1,2,3,4,5,6,7]
	vmulps		32(costab), %ymm6, %ymm6		# bufs[23,22,21,20,19,18,17,16] cos64[8,9,10,11,12,13,14,15]

	/* Stage 2: same pattern on 8-element groups, scaled by cos32[0..7]. */
	vmovaps		64(costab), %ymm8				# cos32[0,1,2,3,4,5,6,7]

	vshufps		$0x1b, %ymm5, %ymm5, %ymm5
	vshufps		$0x1b, %ymm6, %ymm6, %ymm6
	vperm2f128	$0x01, %ymm5, %ymm5, %ymm5		# bufs[15,14,13,12,11,10,9,8]
	vperm2f128	$0x01, %ymm6, %ymm6, %ymm6		# bufs[16,17,18,19,20,21,22,23]
	vsubps		%ymm5, %ymm4, %ymm1
	vsubps		%ymm6, %ymm7, %ymm3
	vaddps		%ymm5, %ymm4, %ymm0				# bufs[32,33,34,35,36,37,38,39]
	vaddps		%ymm6, %ymm7, %ymm2				# bufs[48,49,50,51,52,53,54,55]
	vmulps		%ymm1, %ymm8, %ymm1				# bufs[47,46,45,44,43,42,41,40]
	vmulps		%ymm3, %ymm8, %ymm3				# bufs[63,62,61,60,59,58,57,56]

	/*
		Stage 3: last table row is loaded once into ymm8 and kept for
		stages 4 and 5; its low lane (cos16) is copied to both lanes of ymm9.
	*/
	vmovaps		96(costab), %ymm8				# cos16[0,1,2,3]:cos8[0,1]:cos4[0]:-
	vperm2f128	$0x00, %ymm8, %ymm8, %ymm9		# cos16[0,1,2,3,0,1,2,3]

	vperm2f128	$0x20, %ymm1, %ymm0, %ymm4		# bufs[32,33,34,35,47,46,45,44]
	vperm2f128	$0x31, %ymm1, %ymm0, %ymm5
	vshufps		$0x1b, %ymm5, %ymm5, %ymm5		# bufs[39,38,37,36,40,41,42,43]
	vperm2f128	$0x20, %ymm3, %ymm2, %ymm6		# bufs[48,49,50,51,63,62,61,60]
	vperm2f128	$0x31, %ymm3, %ymm2, %ymm7
	vshufps		$0x1b, %ymm7, %ymm7, %ymm7		# bufs[55,54,53,52,56,57,58,59]
	vsubps		%ymm5, %ymm4, %ymm1
	vsubps		%ymm7, %ymm6, %ymm3
	vaddps		%ymm5, %ymm4, %ymm0				# bufs[0,1,2,3,8,9,10,11]
	vaddps		%ymm7, %ymm6, %ymm2				# bufs[16,17,18,19,24,25,26,27]
	vmulps		%ymm1, %ymm9, %ymm1				# bufs[7,6,5,4,15,14,13,12]
	vmulps		%ymm3, %ymm9, %ymm3				# bufs[23,22,21,20,31,30,29,28]

	/*
		Stage 4: high lane of the table row (cos8, cos4) is copied to both
		lanes of ymm8; vmovddup then repeats the cos8 pair across ymm9.
	*/
	vperm2f128	$0x11, %ymm8, %ymm8, %ymm8		# cos8[0,1]:cos4[0]:-:cos8[0,1]:cos4[0]:-
	vmovddup	%ymm8, %ymm9					# cos8[0,1,0,1,0,1,0,1]

	vunpcklps	%ymm1, %ymm0, %ymm4				# bufs[0,7,1,6,8,15,9,14]
	vunpckhps	%ymm1, %ymm0, %ymm5				# bufs[2,5,3,4,10,13,11,12]
	vunpcklps	%ymm3, %ymm2, %ymm6				# bufs[16,23,17,22,24,31,25,30]
	vunpckhps	%ymm3, %ymm2, %ymm7				# bufs[18,21,19,20,26,29,27,28]
	vshufps		$0xd8, %ymm4, %ymm4, %ymm4		# bufs[0,1,7,6,8,9,15,14]
	vshufps		$0x72, %ymm5, %ymm5, %ymm5		# bufs[3,2,4,5,11,10,12,13]
	vshufps		$0xd8, %ymm6, %ymm6, %ymm6		# bufs[16,17,23,22,24,25,31,30]
	vshufps		$0x72, %ymm7, %ymm7, %ymm7		# bufs[19,18,20,21,27,26,28,29]
	vsubps		%ymm5, %ymm4, %ymm1
	vsubps		%ymm7, %ymm6, %ymm3
	vaddps		%ymm5, %ymm4, %ymm0				# bufs[32,33,36,37,40,41,44,45]
	vaddps		%ymm7, %ymm6, %ymm2				# bufs[48,49,52,53,56,57,60,61]
	vmulps		%ymm1, %ymm9, %ymm1				# bufs[35,34,39,38,43,42,47,46]
	vmulps		%ymm3, %ymm9, %ymm3				# bufs[51,50,55,54,59,58,63,62]

	/* Stage 5: element 2 of each lane of ymm8 (cos4) is broadcast to all elements. */
	vpermilps	$0xaa, %ymm8, %ymm8				# cos4[0,0,0,0,0,0,0,0]

	vshufps		$0xd8, %ymm0, %ymm0, %ymm0		# bufs[32,36,33,37,40,44,41,45]
	vshufps		$0xd8, %ymm1, %ymm1, %ymm1		# bufs[35,39,34,38,43,47,42,46]
	vshufps		$0xd8, %ymm2, %ymm2, %ymm2		# bufs[48,52,49,53,56,60,57,61]
	vshufps		$0xd8, %ymm3, %ymm3, %ymm3		# bufs[51,55,50,54,59,63,58,62]
	vunpcklps	%ymm1, %ymm0, %ymm4				# bufs[32,35,36,39,40,43,44,47]
	vunpckhps	%ymm1, %ymm0, %ymm5				# bufs[33,34,37,38,41,42,45,46]
	vunpcklps	%ymm3, %ymm2, %ymm6				# bufs[48,51,52,55,56,59,60,63]
	vunpckhps	%ymm3, %ymm2, %ymm7				# bufs[49,50,53,54,57,58,61,62]
	vsubps		%ymm5, %ymm4, %ymm1
	vsubps		%ymm7, %ymm6, %ymm3
	vaddps		%ymm5, %ymm4, %ymm0				# bufs[0,2,4,6,8,10,12,14]
	vaddps		%ymm7, %ymm6, %ymm2				# bufs[16,18,20,22,24,26,28,30]
	vmulps		%ymm1, %ymm8, %ymm1				# bufs[1,3,5,7,9,11,13,15]
	vmulps		%ymm3, %ymm8, %ymm3				# bufs[17,19,21,23,25,27,29,31]

	/*
		Final additions.  ymm8 becomes the all-zero register used as the
		"other" source of every blend below.
		vblendps $0xaa keeps the odd-numbered elements of ymm1/ymm3 and zeroes
		the even ones, so only the odd positions of ymm0/ymm2 get an addend.
	*/
	vxorps		%ymm8, %ymm8, %ymm8
	vblendps	$0xaa, %ymm1, %ymm8, %ymm5
	vblendps	$0xaa, %ymm3, %ymm8, %ymm6
	vaddps		%ymm5, %ymm0, %ymm0
	vaddps		%ymm6, %ymm2, %ymm2
	vunpcklps	%ymm1, %ymm0, %ymm4				# bufs[0,1,2,3,8,9,10,11]
	vunpckhps	%ymm1, %ymm0, %ymm5				# bufs[4,5,6,7,12,13,14,15]
	vunpcklps	%ymm3, %ymm2, %ymm6				# bufs[16,17,18,19,24,25,26,27]
	vunpckhps	%ymm3, %ymm2, %ymm7				# bufs[20,21,22,23,28,29,30,31]

	/* From here on the work is done on 128-bit halves: split off the high lanes. */
	vextractf128	$0x1, %ymm4, %xmm0			# bufs[8,9,10,11]
	vextractf128	$0x1, %ymm5, %xmm1			# bufs[12,13,14,15]
	vextractf128	$0x1, %ymm6, %xmm2			# bufs[24,25,26,27]
	vextractf128	$0x1, %ymm7, %xmm3			# bufs[28,29,30,31]

	/*
		vshufps $0x1e reorders to [2,3,1,0]; vblendps $0x7 then keeps the low
		three elements and takes a zero from xmm8 for the top one.
	*/
	vshufps		$0x1e, %xmm5, %xmm5, %xmm9		# bufs[6,7,5,4]
	vshufps		$0x1e, %xmm1, %xmm1, %xmm10		# bufs[14,15,13,12]
	vshufps		$0x1e, %xmm7, %xmm7, %xmm11		# bufs[22,23,21,20]
	vshufps		$0x1e, %xmm3, %xmm3, %xmm12		# bufs[30,31,29,28]
	vblendps	$0x7, %xmm9, %xmm8, %xmm9		# bufs[6,7,5,-]
	vblendps	$0x7, %xmm10, %xmm8, %xmm10		# bufs[14,15,13,-]
	vblendps	$0x7, %xmm11, %xmm8, %xmm11		# bufs[22,23,21,-]
	vblendps	$0x7, %xmm12, %xmm8, %xmm12		# bufs[30,31,29,-]
	vaddps		%xmm5, %xmm9, %xmm5
	vaddps		%xmm1, %xmm10, %xmm1
	vaddps		%xmm7, %xmm11, %xmm7
	vaddps		%xmm3, %xmm12, %xmm3

	/* out0 is about to be advanced by 512 and written from there downwards. */
	prefetcht0	512(out0)

	vshufps		$0x1e, %xmm0, %xmm0, %xmm9		# bufs[10,11,9,8]
	vshufps		$0x1e, %xmm2, %xmm2, %xmm10		# bufs[26,27,25,24]
	vaddps		%xmm1, %xmm0, %xmm0
	vaddps		%xmm3, %xmm2, %xmm2
	vblendps	$0x7, %xmm9, %xmm8, %xmm9		# bufs[10,11,9,-]
	vblendps	$0x7, %xmm10, %xmm8, %xmm10		# bufs[26,27,25,-]
	vaddps		%xmm1, %xmm9, %xmm1
	vaddps		%xmm3, %xmm10, %xmm3

	/*
		Clear the upper ymm halves before the non-VEX SSE instructions below.
		The low 128 bits are untouched, so xmm8 is still all-zero afterwards.
	*/
	vzeroupper
	prefetcht0	512(out1)

	/*
		First output group.  Convert to 32-bit integers, pack with signed
		saturation to 16 bit, and move the eight words of each packed register
		into two 64-bit GPRs (low half directly, high half via pshufd $0x4e).
		%rcx and %rdx are reused as scratch here: costab and samples are no
		longer needed.
	*/
	cvtps2dq	%xmm4, %xmm4
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm5, %xmm5
	cvtps2dq	%xmm1, %xmm1
	packssdw	%xmm5, %xmm4
	packssdw	%xmm1, %xmm0
	movq		%xmm4, %rcx
	pshufd		$0x4e, %xmm4, %xmm5
	movq		%xmm0, %rdx
	pshufd		$0x4e, %xmm0, %xmm1
	movq		%xmm5, %r8
	movq		%xmm1, %r9

	/*
		%rax holds the 64-byte stride and is negated between the out0 stores
		(walking down) and the out1 stores (walking up).  After every batch of
		four stores each GPR is shifted right by 16 to expose its next word.
		Byte offsets below are relative to the pointers passed in.
	*/
	addq		$512, out0
	movq		$-64, %rax

	/* out0 + 512, 448, 384, 320 */
	movw		%cx, (out0)
	movw		%dx, (out0,%rax,1)
	movw		%r8w, (out0,%rax,2)
	movw		%r9w, -64(out0,%rax,2)
	leaq		(out0,%rax,4), out0
	shr			$16, %rcx
	shr			$16, %rdx
	shr			$16, %r8
	shr			$16, %r9
	/* out0 + 0 gets the same word that is stored to out1 + 0 just below */
	movw		%cx, (out0,%rax,4)
	negq		%rax
	/* out1 + 0, 64, 128, 192 */
	movw		%cx, (out1)
	movw		%dx, (out1,%rax,1)
	movw		%r8w, (out1,%rax,2)
	movw		%r9w, 64(out1,%rax,2)
	leaq		(out1,%rax,4), out1
	shr			$16, %rcx
	shr			$16, %rdx
	shr			$16, %r8
	shr			$16, %r9
	negq		%rax
	/* out0 + 256, 192, 128, 64 */
	movw		%cx, (out0)
	movw		%dx, (out0,%rax,1)
	movw		%r8w, (out0,%rax,2)
	movw		%r9w, -64(out0,%rax,2)
	shr			$16, %rcx
	shr			$16, %rdx
	shr			$16, %r8
	shr			$16, %r9
	negq		%rax
	/* out1 + 256, 320, 384, 448 */
	movw		%cx, (out1)
	movw		%dx, (out1,%rax,1)
	movw		%r8w, (out1,%rax,2)
	movw		%r9w, 64(out1,%rax,2)

	/*
		Reposition for the second group, which fills the slots halfway
		between the ones written above: out0 -> original + 480,
		out1 -> original + 32, %rax back to -64.
	*/
	leaq		-32(out0,%rax,4), out0
	negq		%rax
	leaq		32(out1,%rax,4), out1

	/*
		Second output group: each value is the sum of two neighbouring
		partial results (xmm6+xmm2, xmm2+xmm7, xmm7+xmm3, xmm3 + shuffled
		original xmm6 with a zero in the top element).  xmm0 must be built
		from xmm6 before xmm6 is overwritten by the first addps.
	*/
	vshufps		$0x1e, %xmm6, %xmm6, %xmm0
	vblendps	$0x7, %xmm0, %xmm8, %xmm0
	addps		%xmm2, %xmm6
	addps		%xmm7, %xmm2
	addps		%xmm3, %xmm7
	addps		%xmm0, %xmm3
	cvtps2dq	%xmm6, %xmm6
	cvtps2dq	%xmm2, %xmm2
	cvtps2dq	%xmm7, %xmm7
	cvtps2dq	%xmm3, %xmm3
	packssdw	%xmm7, %xmm6
	packssdw	%xmm3, %xmm2
	movq		%xmm6, %rcx
	pshufd		$0x4e, %xmm6, %xmm7
	movq		%xmm2, %rdx
	pshufd		$0x4e, %xmm2, %xmm3
	movq		%xmm7, %r8
	movq		%xmm3, %r9

	/* out0 + 480, 416, 352, 288 */
	movw		%cx, (out0)
	movw		%dx, (out0,%rax,1)
	movw		%r8w, (out0,%rax,2)
	movw		%r9w, -64(out0,%rax,2)
	leaq		(out0,%rax,4), out0
	shr			$16, %rcx
	shr			$16, %rdx
	shr			$16, %r8
	shr			$16, %r9
	negq		%rax
	/* out1 + 32, 96, 160, 224 */
	movw		%cx, (out1)
	movw		%dx, (out1,%rax,1)
	movw		%r8w, (out1,%rax,2)
	movw		%r9w, 64(out1,%rax,2)
	leaq		(out1,%rax,4), out1
	shr			$16, %rcx
	shr			$16, %rdx
	shr			$16, %r8
	shr			$16, %r9
	negq		%rax
	/* out0 + 224, 160, 96, 32 */
	movw		%cx, (out0)
	movw		%dx, (out0,%rax,1)
	movw		%r8w, (out0,%rax,2)
	movw		%r9w, -64(out0,%rax,2)
	shr			$16, %rcx
	shr			$16, %rdx
	shr			$16, %r8
	shr			$16, %r9
	negq		%rax
	/* out1 + 288, 352, 416, 480 */
	movw		%cx, (out1)
	movw		%dx, (out1,%rax,1)
	movw		%r8w, (out1,%rax,2)
	movw		%r9w, 64(out1,%rax,2)

#ifdef IS_MSABI
	/* Undo the prologue in reverse order: %rsi/%rdi first, then xmm6-xmm12, then the frame. */
	pop			%rsi
	pop			%rdi
	movaps		(%rsp), %xmm6
	movaps		16(%rsp), %xmm7
	movaps		32(%rsp), %xmm8
	movaps		48(%rsp), %xmm9
	movaps		64(%rsp), %xmm10
	movaps		80(%rsp), %xmm11
	movaps		96(%rsp), %xmm12
	mov			%rbp, %rsp
	pop			%rbp
#endif
	ret
|
|
Packit |
c32a2d |
|
|
Packit |
c32a2d |
/* NOTE(review): NONEXEC_STACK is not defined in this file; presumably it comes from mangle.h and marks the object as not needing an executable stack -- confirm there. */
NONEXEC_STACK
|