#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific index permutations organized by quadruples are:
#
#       [4][4] [3][3] [2][2] [1][1]<-+
#       [0][4] [0][3] [0][2] [0][1]<-+
#       [3][0] [1][0] [4][0] [2][0]  |
#       [4][3] [3][1] [2][4] [1][2]  |
#       [3][4] [1][3] [4][2] [2][1]  |
#       [2][3] [4][1] [1][4] [3][2]  |
#       [2][2] [4][4] [1][1] [3][3] -+
#
# This however is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#       [4][4] [3][3] [2][2] [1][1]
#
# So here we have it, lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent Pi circular permutation in chosen layout. Note
# that first step is permutation-free.] A[0][0] is loaded to register of
# its own, to all lanes. [A[0][0] is not part of Pi permutation or Rho.]
# Digits in variables' names denote right-most coordinates:

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within structure:

# Entry 5*y+x describes where lane A[y][x] lives, first as a pair
# [register index, 64-bit lane within that register], then (after the
# map below) as a byte offset assuming every register occupies 4*8 bytes.
my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear

# But on the other hand Chi is much better off if y indices were aligned
# column-wise, not x. For this reason we have to shuffle data prior
# Chi and revert it afterwards. Prior shuffle is naturally merged with
# Pi itself:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#       [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#       [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#       [3][3] [1][1] [4][4] [2][2]
#
# And reverse post-Chi permutation:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#       [4][4] [3][3] [2][2] [1][1]
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# Haswell		8.7/+10%
# Skylake		7.8/+20%
# Ryzen			17(**)
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	coefficient in comparison to scalar keccak1600-x86_64.pl.
# (**)	It's expected that Ryzen performs poorly, because instruction
#	issue rate is limited to two AVX2 instructions per cycle and
#	in addition vpblendd is reportedly bound to specific port.
#	Obviously this code path should not be executed on Ryzen.

# Scratch registers %ymm7..%ymm15. The Theta temporaries C14/C00/D00/D14
# are aliases for @T[5..8], so they share registers with the scratch
# values used later in the round.
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

# __KeccakF1600: internal routine running 24 rounds (counter in %eax) over
# the state kept in $A00..$A11. Each round is Theta, then Rho+Pi merged with
# the pre-Chi shuffle, then Chi with the post-Chi shuffle, then Iota.
# Uses %r8/%r9 as biased pointers to the rotate tables, %r10 as the running
# round-constant pointer and all of @T as scratch.
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpxor		$A01,$C14,$C14
	vpxor		@T[2],$C14,$C14		# C[1..4]

	vpermq		\$0b10010011,$C14,@T[4]
	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)

	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
	vpsllvq		0*32-96(%r8),$A20,@T[3]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		@T[3],$A20,$A20

	 vpxor		$D14,$A31,$A31		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),$A31,@T[4]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		@T[4],$A31,$A31

	 vpxor		$D14,$A21,$A21		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),$A21,@T[5]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		@T[5],$A21,$A21

	 vpxor		$D14,$A41,$A41		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),$A41,@T[6]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[6],$A41,$A41

	 vpxor		$D14,$A11,$A11		# ^= D[1..4] from Theta
	 vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	 vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpsllvq		5*32-96(%r8),$A11,@T[7]
	vpsrlvq		5*32-96(%r9),$A11,@T[1]
	vpor		@T[7],@T[1],@T[1]	# $A11 -> future $A01

	 vpxor		$D14,$A01,$A01		# ^= D[1..4] from Theta
	 vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	 vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpsllvq		1*32-96(%r8),$A01,@T[8]
	vpsrlvq		1*32-96(%r9),$A01,@T[2]
	vpor		@T[8],@T[2],@T[2]	# $A01 -> future $A20

	######################################### Chi
	vpsrldq		\$8,@T[1],@T[7]
	vpandn		@T[7],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	 vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	 vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	 vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	 vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	 vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	 vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A31,$A31		# tgting  [3][1] [1][2] [4][3] [2][4]
	 vpandn		@T[7],$A41,$A41		# tgting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	 vpxor		@T[3],$A31,$A31
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	 vpxor		@T[5],$A41,$A41
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[8],$A11,$A11		# tgting  [3][3] [1][1] [4][4] [2][2]
	vpxor		@T[6],$A11,$A11

	  vpermq	\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	  vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	  vpermq	\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	  vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	  vpandn	@T[8],$A01,$A01		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# tgting  [3][0] [1][0] [4][0] [2][0]
	vpxor		@T[2],$A20,$A20

	 vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	 vpermq		\$0b00011011,$A31,$A31	# post-Chi shuffle
	 vpermq		\$0b10001101,$A41,$A41
	 vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[4],$A21,$A21

	######################################### Iota
	vpxor		(%r10),$A00,$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
# Argument registers of SHA3_absorb/SHA3_squeeze, in order: state pointer,
# input (absorb) or output (squeeze) pointer, byte count, block size.
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

# SHA3_absorb, first part: save %rsp in %r11, carve a 32-byte aligned
# transfer area on the stack, load the state from A_flat into
# $A00..$A11, zero the transfer area and start the per-block loop. The
# first five input lanes are loaded straight into @T[0]/@T[1]; %eax is left
# holding the number of remaining lanes for the unrolled copy that follows.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx2:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx2

	shr		\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub		\$4,%eax
___
# Unrolled copy of input lanes 5..24 into the stack transfer area, each at
# its "jagged" offset. The emitted code leaves through .Labsorved_avx2 as
# soon as the lane counter in %eax reaches zero.
for my $i (5..24) {
$code.=<<___;
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)
___
}
# Remainder of SHA3_absorb: advance the input pointer, xor the block into
# the state, permute and loop; on exit store the state back to A_flat and
# return the number of unprocessed bytes in %rax. This is followed by the
# entry of SHA3_squeeze, which loads the state and fetches lane A[0][0]
# into %r8 for the unrolled output loop that follows.
$code.=<<___;
.Labsorved_avx2:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	mov	$bsz,%rax

.Loop_squeeze_avx2:
	mov	$A_jagged[0]-96($A_flat),%r8
___
# Unrolled output of up to 25 lanes. Lane $i is already in %r8 on entry to
# each step. The next lane is fetched at the end of the step with a bias of
# -120 instead of -96: in A_flat the six vector registers follow the 8-byte
# A[0][0] (offsets 8+32*k), whereas \@A_jagged assumes register k sits at
# 32*k, i.e. 24 bytes further.
for my $i (0..24) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx2
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
___
    # There is no lane after the 25th: do not emit a load from outside
    # the state (the value would be discarded anyway, because %r8 is
    # reloaded at .Loop_squeeze_avx2).
$code.="\tmov\t$A_jagged[$i+1]-120($A_flat),%r8\n"	if ($i<24);
}
# Remainder of SHA3_squeeze: permute and store the state when a whole block
# has been output, and emit the final 1..7 bytes one at a time. This is
# followed by the constant tables: per-lane left/right rotate counts in
# register order and the 24 round constants replicated across four lanes.
$code.=<<___;
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx2


.Ltail_squeeze_avx2:
	add	\$8,$len
	jz	.Ldone_squeeze_avx2		# zero-length request
.Loop_tail_avx2:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

# Write the generated assembly to the file named by the last command-line
# argument. Without an argument STDOUT is left as inherited; reopening it
# on an empty name would only close it and make the final close() fail.
# Three-argument open keeps special characters in the file name from being
# interpreted as an open mode.
$output=pop;
if (defined $output) {
    open STDOUT,'>',$output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";