#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX-512F.
#
# July 2017.
#
# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# Pretty straightforward, the only "magic" is data layout in registers.
# It's impossible to have one that is optimal for every step, hence
# it's changing as algorithm progresses. Data is saved in linear order,
# but in-register order morphs between rounds. Even rounds take in
# linear layout, and odd rounds - transposed, or "vertically-shaped"...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# Knights Landing	7.6
# Skylake-X		5.7
#
# (*)	Corresponds to SHA3-256.

########################################################################
# Below code is combination of two ideas. One is taken from Keccak Code
# Package, hereafter KCP, and another one from initial version of this
# module. What is common is observation that Pi's input and output are
# "mostly transposed", i.e. if input is aligned by x coordinate, then
# output is [mostly] aligned by y. Both versions, KCP and predecessor,
# were trying to use one of them from round to round, which resulted in
# some kind of transposition in each round. This version still does
# transpose data, but only every second round. Another essential factor
# is that KCP transposition has to be performed with instructions that
# turned out to be rather expensive on Knights Landing, both latency- and
# throughput-wise. Not to mention that some of them have to depend on
# each other. On the other hand initial version of this module was
# relying heavily on blend instructions. There were lots of them,
# resulting in higher instruction count, yet it performed better on
# Knights Landing, because processor can execute pair of them each
# cycle and they have minimal latency. This module is an attempt to
# bring best parts together:-)
#
# Coordinates below correspond to those in sha/keccak1600.c. Input
# layout is straight linear:
#
# [0][4] [0][3] [0][2] [0][1] [0][0]
# [1][4] [1][3] [1][2] [1][1] [1][0]
# [2][4] [2][3] [2][2] [2][1] [2][0]
# [3][4] [3][3] [3][2] [3][1] [3][0]
# [4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield layout perfect for Chi:
#
# [4][0] [3][0] [2][0] [1][0] [0][0]
# [4][1] [3][1] [2][1] [1][1] [0][1]
# [4][2] [3][2] [2][2] [1][2] [0][2]
# [4][3] [3][3] [2][3] [1][3] [0][3]
# [4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now instead of performing full transposition and feeding it to next
# identical round, we perform kind of diagonal transposition to layout
# from initial version of this module, and make it suitable for Theta:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
#
# Now intra-register permutations yield initial [almost] straight
# linear layout:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]
##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0]
##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0]
##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0]
##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0]
##[3][0] [3][4] [3][3] [3][2] [3][1]
#
# This means that odd round Chi is performed in less suitable layout,
# with a number of additional permutations. But overall it turned out to
# be a win. Permutations are fastest possible on Knights Landing and they
# are laid down to be independent of each other. In essence I traded
# 20 blend instructions for 3 permutations. The result is 13% faster
# than KCP on Skylake-X, and >40% on Knights Landing.
#
# As implied, data is loaded in straight linear order. Digits in
# variables' names represent coordinates of right-most element of
# loaded data chunk:

# One zmm register per state row; each holds five 64-bit lanes of the
# 5x5 Keccak state in its low quadwords.
my ($A00,	# [0][4] [0][3] [0][2] [0][1] [0][0]
    $A10,	# [1][4] [1][3] [1][2] [1][1] [1][0]
    $A20,	# [2][4] [2][3] [2][2] [2][1] [2][0]
    $A30,	# [3][4] [3][3] [3][2] [3][1] [3][0]
    $A40) =	# [4][4] [4][3] [4][2] [4][1] [4][0]
    map("%zmm$_",(0..4));

# We also need to map the magic order into offsets within structure:

# @A_jagged starts as [row,column] pairs in linear lane order and is then
# turned into byte offsets: every row is given 8 quadwords (64 bytes, one
# zmm register's worth) even though only 5 of them are used.
my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
		[1,0], [1,1], [1,2], [1,3], [1,4],
		[2,0], [2,1], [2,2], [2,3], [2,4],
		[3,0], [3,1], [3,2], [3,3], [3,4],
		[4,0], [4,1], [4,2], [4,3], [4,4]);
   @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ... and now linear

# Register allocation for temporaries and for the constant tables that
# the entry points preload (permutation indices and rotation counts).
my @T        = map("%zmm$_",(5..12));
my @Theta    = map("%zmm$_",(33,13..16));	# invalid @Theta[0] is not typo
my @Pi0      = map("%zmm$_",(17..21));
my @Rhotate0 = map("%zmm$_",(22..26));
my @Rhotate1 = map("%zmm$_",(27..31));

# $C00/$D00 alias the first two temporaries; the %k names spell out the
# lane bit pattern each opmask register is expected to hold.
my ($C00,$D00) = @T[0..1];
my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
# Emit the internal permutation routine __KeccakF1600.
#
# The loop body contains one "even" and one "odd" round and is executed
# 12 times (counter in %eax). %r10 walks the iotas table: the even round
# reads (%r10), the pointer is advanced by 16 bytes, and the odd round
# reads -8(%r10).
#
# The routine reads but never writes @Theta[1..4], @Pi0, @Rhotate0,
# @Rhotate1 and the opmask registers, so the caller must have loaded
# them. It works on $A00..$A40 in place and clobbers @T[0..7], %r10
# and %eax.
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$12,%eax
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
	######################################### Theta, even round
	vmovdqa64	$A00,@T[0]		# put aside original A00
	vpternlogq	\$0x96,$A20,$A10,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A40,$A30,$A00

	vprolq		\$1,$A00,$D00
	vpermq		$A00,@Theta[1],$A00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$A00,$D00,@T[0]	# T[0] is original A00
	vpternlogq	\$0x96,$A00,$D00,$A10
	vpternlogq	\$0x96,$A00,$D00,$A20
	vpternlogq	\$0x96,$A00,$D00,$A30
	vpternlogq	\$0x96,$A00,$D00,$A40

	######################################### Rho
	vprolvq		@Rhotate0[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate0[1],$A10,$A10
	vprolvq		@Rhotate0[2],$A20,$A20
	vprolvq		@Rhotate0[3],$A30,$A30
	vprolvq		@Rhotate0[4],$A40,$A40

	######################################### Pi
	vpermq		$A00,@Pi0[0],$A00
	vpermq		$A10,@Pi0[1],$A10
	vpermq		$A20,@Pi0[2],$A20
	vpermq		$A30,@Pi0[3],$A30
	vpermq		$A40,@Pi0[4],$A40

	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vmovdqa64	$A10,@T[1]
	vpternlogq	\$0xD2,$A20,$A10,$A00
	vpternlogq	\$0xD2,$A30,$A20,$A10
	vpternlogq	\$0xD2,$A40,$A30,$A20
	vpternlogq	\$0xD2,@T[0],$A40,$A30
	vpternlogq	\$0xD2,@T[1],@T[0],$A40

	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
	lea		16(%r10),%r10

	######################################### Harmonize rounds
	vpblendmq	$A20,$A10,@{T[1]}{$k00010}
	vpblendmq	$A30,$A20,@{T[2]}{$k00010}
	vpblendmq	$A40,$A30,@{T[3]}{$k00010}
	 vpblendmq	$A10,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A40,@{T[4]}{$k00010}

	vpblendmq	$A30,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A40,@T[2],@{T[2]}{$k00100}
	 vpblendmq	$A20,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A00,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A10,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A40,@T[1],@{T[1]}{$k01000}
	 vpblendmq	$A30,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A00,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A10,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A20,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A40,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A00,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A10,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A20,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A30,@T[4],@{T[4]}{$k10000}

	#vpermq		@T[0],@Theta[0],$A00	# doesn't actually change order
	vpermq		@T[1],@Theta[1],$A10
	vpermq		@T[2],@Theta[2],$A20
	vpermq		@T[3],@Theta[3],$A30
	vpermq		@T[4],@Theta[4],$A40

	######################################### Theta, odd round
	vmovdqa64	$T[0],$A00		# real A00
	vpternlogq	\$0x96,$A20,$A10,$C00	# C00 is @T[0]'s alias
	vpternlogq	\$0x96,$A40,$A30,$C00

	vprolq		\$1,$C00,$D00
	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A30
	vpternlogq	\$0x96,$C00,$D00,$A10
	vpternlogq	\$0x96,$C00,$D00,$A40
	vpternlogq	\$0x96,$C00,$D00,$A20

	######################################### Rho
	vprolvq		@Rhotate1[0],$A00,$A00
	vprolvq		@Rhotate1[3],$A30,@T[1]
	vprolvq		@Rhotate1[1],$A10,@T[2]
	vprolvq		@Rhotate1[4],$A40,@T[3]
	vprolvq		@Rhotate1[2],$A20,@T[4]

	 vpermq		$A00,@Theta[4],@T[5]
	 vpermq		$A00,@Theta[3],@T[6]

	######################################### Iota
	vpxorq		-8(%r10),$A00,${A00}{$k00001}

	######################################### Pi
	vpermq		@T[1],@Theta[2],$A10
	vpermq		@T[2],@Theta[4],$A20
	vpermq		@T[3],@Theta[1],$A30
	vpermq		@T[4],@Theta[3],$A40

	######################################### Chi
	vpternlogq	\$0xD2,@T[6],@T[5],$A00

	vpermq		@T[1],@Theta[1],@T[7]
	#vpermq		@T[1],@Theta[0],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[7],$A10

	vpermq		@T[2],@Theta[3],@T[0]
	vpermq		@T[2],@Theta[2],@T[2]
	vpternlogq	\$0xD2,@T[2],@T[0],$A20

	#vpermq		@T[3],@Theta[0],@T[3]
	vpermq		@T[3],@Theta[4],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[3],$A30

	vpermq		@T[4],@Theta[2],@T[0]
	vpermq		@T[4],@Theta[1],@T[4]
	vpternlogq	\$0xD2,@T[4],@T[0],$A40

	dec		%eax
	jnz		.Loop_avx512

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
# Argument registers of the two public entry points, in order:
# state pointer, input pointer, length, block size. SHA3_squeeze
# receives its output pointer in the same register as the input pointer.
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze
# Emit the head of SHA3_absorb, up to the point where input lanes are
# copied (the copy itself is generated by the loop that follows).
#
# What the emitted code does:
#  - saves %rsp in %r11 and carves out a 64-byte-aligned area on the
#    stack, addressed through %r9 = %rsp+128;
#  - biases the state and input pointers by 96;
#  - builds the opmasks: kxnorw gives all ones, a right shift by 15
#    leaves bit 0, a right shift by 11 leaves the low five bits, and the
#    single-bit masks are left shifts of bit 0;
#  - loads the constant tables that start at theta_perm in 64-byte steps
#    (slot 0 is skipped, its load is commented out);
#  - loads the five state rows at a 40-byte stride with zero-masking,
#    so only five lanes per row are read;
#  - zeroes the 5x64-byte transfer area;
#  - at the loop head subtracts the block size from the length, leaves
#    on borrow, and puts the block size in 8-byte lanes in %eax.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-320(%rsp),%rsp
	and	\$-64,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	128(%rsp),%r9

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
	vmovdqa64	@T[0],2*64-128(%r9)
	vmovdqa64	@T[0],3*64-128(%r9)
	vmovdqa64	@T[0],4*64-128(%r9)
	jmp		.Loop_absorb_avx512

.align	32
.Loop_absorb_avx512:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx512

	shr		\$3,%eax
___
# Unrolled copy of up to 25 input lanes into the stack transfer area.
# Lane $i is read from the input at offset 8*$i (minus the 96-byte bias)
# and stored at its row-padded offset from @A_jagged. %eax counts the
# lanes still to copy; as soon as it reaches zero the emitted code jumps
# to .Labsorved_avx512, and the 25th copy falls through to that label.
for(my $i=0; $i<25; $i++) {
$code.=<<___
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-128(%r9)
	dec	%eax
	jz	.Labsorved_avx512
___
}
# Emit the remainder of SHA3_absorb, all of SHA3_squeeze and the
# constant tables.
#
# SHA3_absorb tail: advances the input pointer by the block size, XORs
# the transfer area into the state rows, calls __KeccakF1600 and loops.
# On exit it stores the rows back with the five-lane mask, restores
# %rsp from %r11 and returns length plus block size in %rax (the length
# register went negative when the final subtraction borrowed).
#
# SHA3_squeeze: loads the tables and state into registers only when
# the requested length exceeds the block size, then copies 8 bytes at a
# time from the in-memory state. After a full block has been copied it
# calls __KeccakF1600, stores the state back and starts over. A tail
# shorter than 8 bytes is copied with "rep movsb".
#
# Tables: theta_perm, rhotates1, rhotates0 and pi0_perm are laid out
# back to back in 64-byte rows, in the order the entry points load
# them relative to theta_perm; iotas holds the 24 round constants.
$code.=<<___;
.Labsorved_avx512:
	lea	($inp,$bsz),$inp

	vpxorq	64*0-128(%r9),$A00,$A00
	vpxorq	64*1-128(%r9),$A10,$A10
	vpxorq	64*2-128(%r9),$A20,$A20
	vpxorq	64*3-128(%r9),$A30,$A30
	vpxorq	64*4-128(%r9),$A40,$A40

	call	__KeccakF1600

	jmp	.Loop_absorb_avx512

.align	32
.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	cmp	$bsz,$len
	jbe	.Lno_output_extension_avx512

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

.Lno_output_extension_avx512:
	shr	\$3,$bsz
	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.align	32
.Loop_squeeze_avx512:
	cmp	\$8,$len
	jb	.Ltail_squeeze_avx512

	mov	(%r9),%r8
	lea	8(%r9),%r9
	mov	%r8,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze_avx512

	sub	\$1,%rax		# bsz--
	jnz	.Loop_squeeze_avx512

	#vpermq		@Theta[4],@Theta[4],@Theta[3]
	#vpermq		@Theta[3],@Theta[4],@Theta[2]
	#vpermq		@Theta[3],@Theta[3],@Theta[1]

	call		__KeccakF1600

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.Ltail_squeeze_avx512:
	mov	$out,%rdi
	mov	%r9,%rsi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze_avx512:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
theta_perm:
	.quad	0, 1, 2, 3, 4, 5, 6, 7		# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7
	.quad	3, 4, 0, 1, 2, 5, 6, 7
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7

rhotates1:
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,	62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

rhotates0:
	.quad	 0,  1, 62, 28, 27, 0, 0, 0
	.quad	36, 44,  6, 55, 20, 0, 0, 0
	.quad	 3, 10, 43, 25, 39, 0, 0, 0
	.quad	41, 45, 15, 21,  8, 0, 0, 0
	.quad	18,  2, 61, 56, 14, 0, 0, 0

pi0_perm:
	.quad	0, 3, 1, 4, 2, 5, 6, 7
	.quad	1, 4, 2, 0, 3, 5, 6, 7
	.quad	2, 0, 3, 1, 4, 5, 6, 7
	.quad	3, 1, 4, 2, 0, 5, 6, 7
	.quad	4, 2, 0, 3, 1, 5, 6, 7


iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___
# Write the generated assembly. The last command-line argument names the
# output file; when none is given the code goes to the inherited STDOUT.
# The three-argument open keeps characters in the file name from being
# read as a mode or a pipe, and a failed open is fatal instead of
# silently sending the output somewhere else.
$output=pop;
if (defined $output && length $output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";