Blame crypto/sha/asm/keccak1600-x86_64.pl

Packit c4476c
#!/usr/bin/env perl
Packit c4476c
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
#
Packit c4476c
# ====================================================================
Packit c4476c
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
Packit c4476c
# project. The module is, however, dual licensed under OpenSSL and
Packit c4476c
# CRYPTOGAMS licenses depending on where you obtain it. For further
Packit c4476c
# details see http://www.openssl.org/~appro/cryptogams/.
Packit c4476c
# ====================================================================
Packit c4476c
#
Packit c4476c
# Keccak-1600 for x86_64.
Packit c4476c
#
Packit c4476c
# June 2017.
Packit c4476c
#
Packit c4476c
# Below code is [lane complementing] KECCAK_2X implementation (see
Packit c4476c
# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
Packit c4476c
# instead of actually unrolling the loop pair-wise I simply flip
Packit c4476c
# pointers to T[][] and A[][] at the end of round. Since number of
Packit c4476c
# rounds is even, last round writes to A[][] and everything works out.
Packit c4476c
# How does it compare to x86_64 assembly module in Keccak Code Package?
Packit c4476c
# Depending on processor it's either as fast or faster by up to 15%...
Packit c4476c
#
Packit c4476c
########################################################################
Packit c4476c
# Numbers are cycles per processed byte out of large message.
Packit c4476c
#
Packit c4476c
#			r=1088(*)
Packit c4476c
#
Packit c4476c
# P4			25.8
Packit c4476c
# Core 2		12.9
Packit c4476c
# Westmere		13.7
Packit c4476c
# Sandy Bridge		12.9(**)
Packit c4476c
# Haswell		9.6
Packit c4476c
# Skylake		9.4
Packit c4476c
# Silvermont		22.8
Packit c4476c
# Goldmont		15.8
Packit c4476c
# VIA Nano		17.3
Packit c4476c
# Sledgehammer		13.3
Packit c4476c
# Bulldozer		16.5
Packit c4476c
# Ryzen			8.8
Packit c4476c
#
Packit c4476c
# (*)	Corresponds to SHA3-256. Improvement over compiler-generated code
Packit c4476c
#	varies a lot, most common coefficient is 15% in comparison to
Packit c4476c
#	gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
Packit c4476c
# (**)	Sandy Bridge has broken rotate instruction. Performance can be
Packit c4476c
#	improved by 14% by replacing rotates with double-precision
Packit c4476c
#	shift with same register as source and destination.
Packit c4476c
Packit c4476c
# Command line is "[flavour] output".  A first argument that contains a dot
# is taken to be the output file name, in which case there is no flavour.
$flavour = shift;
Packit c4476c
$output  = shift;
Packit c4476c
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
Packit c4476c
Packit c4476c
# $win64 is set for nasm/masm/mingw64 flavours or an output name ending in
# ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
Packit c4476c
Packit c4476c
# Locate the x86_64-xlate.pl translator: first in this script's own
# directory, then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
Packit c4476c
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
Packit c4476c
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
Packit c4476c
die "can't locate x86_64-xlate.pl";
Packit c4476c
Packit c4476c
# Pipe everything printed to STDOUT through the translator.  Fail right here
# with the reason if the pipe cannot be opened, instead of printing into an
# unopened handle and only noticing at the final close.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
Packit c4476c
*STDOUT=*OUT;
Packit c4476c
Packit c4476c
# $A[y][x] is the byte displacement of 64-bit lane (y,x) in the 5x5 state,
# i.e. 8*(5*y+x), biased by -100 so that the values span -100..92.  The
# entry points below add 100 to the state pointers to compensate (marked
# "size optimization" there; presumably this keeps every displacement
# within a signed byte).
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
Packit c4476c
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
Packit c4476c
Packit c4476c
# Register bank: five registers for C[0..4], five for D[0..4] (%r8..%r12),
# two temporaries (%r13, %r14) and the round-constant pointer (%r15).
# @C and @D are re-bound to different registers between templates below.
my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
Packit c4476c
my @D = map("%r$_",(8..12));
Packit c4476c
my @T = map("%r$_",(13..14));
Packit c4476c
my $iotas = "%r15";
Packit c4476c
Packit c4476c
# Rotation counts, indexed [y][x] like @A; interpolated as the immediate
# operand of the rol instructions in the round template.
my @rhotates = ([  0,  1, 62, 28, 27 ],
Packit c4476c
                [ 36, 44,  6, 55, 20 ],
Packit c4476c
                [  3, 10, 43, 25, 39 ],
Packit c4476c
                [ 41, 45, 15, 21,  8 ],
Packit c4476c
                [ 18,  2, 61, 56, 14 ]);
Packit c4476c
Packit c4476c
# First part of the __KeccakF1600 template.  The routine reads the state
# through %rdi and takes the current round constant pointer in $iotas.
# Before the loop, C[0..4] are preloaded with row 4 (A[4][0..4]).  Each pass
# through .Loop is one round; this part XORs the remaining rows into
# C[0..4] to form the column parities, keeps copies of the diagonal lanes
# A[0][0], A[1][1], A[2][2], A[3][3], A[4][4] in D[0..4], and then turns the
# parities into the five Theta D values in place (see the trailing
# "D[n] = ..." comments for which register ends up holding which value).
$code.=<<___;
Packit c4476c
.text
Packit c4476c
Packit c4476c
.type	__KeccakF1600,\@abi-omnipotent
Packit c4476c
.align	32
Packit c4476c
__KeccakF1600:
Packit c4476c
.cfi_startproc
Packit c4476c
	mov	$A[4][0](%rdi),@C[0]
Packit c4476c
	mov	$A[4][1](%rdi),@C[1]
Packit c4476c
	mov	$A[4][2](%rdi),@C[2]
Packit c4476c
	mov	$A[4][3](%rdi),@C[3]
Packit c4476c
	mov	$A[4][4](%rdi),@C[4]
Packit c4476c
	jmp	.Loop
Packit c4476c
Packit c4476c
.align	32
Packit c4476c
.Loop:
Packit c4476c
	mov	$A[0][0](%rdi),@D[0]
Packit c4476c
	mov	$A[1][1](%rdi),@D[1]
Packit c4476c
	mov	$A[2][2](%rdi),@D[2]
Packit c4476c
	mov	$A[3][3](%rdi),@D[3]
Packit c4476c
Packit c4476c
	xor	$A[0][2](%rdi),@C[2]
Packit c4476c
	xor	$A[0][3](%rdi),@C[3]
Packit c4476c
	xor	@D[0],         @C[0]
Packit c4476c
	xor	$A[0][1](%rdi),@C[1]
Packit c4476c
	 xor	$A[1][2](%rdi),@C[2]
Packit c4476c
	 xor	$A[1][0](%rdi),@C[0]
Packit c4476c
	mov	@C[4],@D[4]
Packit c4476c
	xor	$A[0][4](%rdi),@C[4]
Packit c4476c
Packit c4476c
	xor	@D[2],         @C[2]
Packit c4476c
	xor	$A[2][0](%rdi),@C[0]
Packit c4476c
	 xor	$A[1][3](%rdi),@C[3]
Packit c4476c
	 xor	@D[1],         @C[1]
Packit c4476c
	 xor	$A[1][4](%rdi),@C[4]
Packit c4476c
Packit c4476c
	xor	$A[3][2](%rdi),@C[2]
Packit c4476c
	xor	$A[3][0](%rdi),@C[0]
Packit c4476c
	 xor	$A[2][3](%rdi),@C[3]
Packit c4476c
	 xor	$A[2][1](%rdi),@C[1]
Packit c4476c
	 xor	$A[2][4](%rdi),@C[4]
Packit c4476c
Packit c4476c
	mov	@C[2],@T[0]
Packit c4476c
	rol	\$1,@C[2]
Packit c4476c
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
Packit c4476c
	 xor	@D[3],         @C[3]
Packit c4476c
Packit c4476c
	rol	\$1,@C[0]
Packit c4476c
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
Packit c4476c
	 xor	$A[3][1](%rdi),@C[1]
Packit c4476c
Packit c4476c
	rol	\$1,@C[3]
Packit c4476c
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
Packit c4476c
	 xor	$A[3][4](%rdi),@C[4]
Packit c4476c
Packit c4476c
	rol	\$1,@C[1]
Packit c4476c
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]
Packit c4476c
Packit c4476c
	rol	\$1,@C[4]
Packit c4476c
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
Packit c4476c
___
Packit c4476c
	# Re-bind the register names for the rest of the round.  The Theta
	# D[0..4] values were produced in the registers so far called
	# C[1], C[2], C[3], C[4], C[0], so those become @D[0..4]; the registers
	# so far called D[0..4], which hold the diagonal lanes A[0][0],
	# A[1][1], A[2][2], A[3][3], A[4][4], become @C[0..4].
	(@D[0..4], @C) = (@C[1..4,0], @D);
Packit c4476c
# Rows 0..3 of the round output.  For each row, five lanes are XORed with
# their D value, rotated by the matching @rhotates count, combined as
# spelled out in the trailing "R[y][x] = ..." comments (the header calls
# this the lane complementing variant) and stored through %rsi.  The round
# constant at ($iotas) is folded into R[0][0] and $iotas is advanced by 8.
# The final group prepares the five row-4 inputs in @D and exchanges
# %rsi and %rdi, so what follows addresses the freshly written state as
# %rdi.
$code.=<<___;
Packit c4476c
	xor	@D[1],@C[1]
Packit c4476c
	xor	@D[2],@C[2]
Packit c4476c
	rol	\$$rhotates[1][1],@C[1]
Packit c4476c
	xor	@D[3],@C[3]
Packit c4476c
	xor	@D[4],@C[4]
Packit c4476c
	rol	\$$rhotates[2][2],@C[2]
Packit c4476c
	xor	@D[0],@C[0]
Packit c4476c
	 mov	@C[1],@T[0]
Packit c4476c
	rol	\$$rhotates[3][3],@C[3]
Packit c4476c
	 or	@C[2],@C[1]
Packit c4476c
	 xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
Packit c4476c
	rol	\$$rhotates[4][4],@C[4]
Packit c4476c
Packit c4476c
	 xor	($iotas),@C[1]
Packit c4476c
	 lea	8($iotas),$iotas
Packit c4476c
Packit c4476c
	mov	@C[4],@T[1]
Packit c4476c
	and	@C[3],@C[4]
Packit c4476c
	 mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
Packit c4476c
	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
Packit c4476c
	not	@C[2]
Packit c4476c
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])
Packit c4476c
Packit c4476c
	or	@C[3],@C[2]
Packit c4476c
	  mov	$A[4][2](%rdi),@C[4]
Packit c4476c
	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
Packit c4476c
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])
Packit c4476c
Packit c4476c
	and	@C[0],@T[0]
Packit c4476c
	  mov	$A[1][4](%rdi),@C[1]
Packit c4476c
	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
Packit c4476c
	  mov	$A[2][0](%rdi),@C[2]
Packit c4476c
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])
Packit c4476c
Packit c4476c
	or	@C[0],@T[1]
Packit c4476c
	  mov	$A[0][3](%rdi),@C[0]
Packit c4476c
	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
Packit c4476c
	  mov	$A[3][1](%rdi),@C[3]
Packit c4476c
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])
Packit c4476c
Packit c4476c
Packit c4476c
	xor	@D[3],@C[0]
Packit c4476c
	xor	@D[2],@C[4]
Packit c4476c
	rol	\$$rhotates[0][3],@C[0]
Packit c4476c
	xor	@D[1],@C[3]
Packit c4476c
	xor	@D[4],@C[1]
Packit c4476c
	rol	\$$rhotates[4][2],@C[4]
Packit c4476c
	rol	\$$rhotates[3][1],@C[3]
Packit c4476c
	xor	@D[0],@C[2]
Packit c4476c
	rol	\$$rhotates[1][4],@C[1]
Packit c4476c
	 mov	@C[0],@T[0]
Packit c4476c
	 or	@C[4],@C[0]
Packit c4476c
	rol	\$$rhotates[2][0],@C[2]
Packit c4476c
Packit c4476c
	xor	@C[3],@C[0]		#           C[3] ^ (C[0] |  C[4])
Packit c4476c
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])
Packit c4476c
Packit c4476c
	mov	@C[1],@T[1]
Packit c4476c
	and	@T[0],@C[1]
Packit c4476c
	  mov	$A[0][1](%rdi),@C[0]
Packit c4476c
	xor	@C[4],@C[1]		#           C[4] ^ (C[1] &  C[0])
Packit c4476c
	not	@C[4]
Packit c4476c
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])
Packit c4476c
Packit c4476c
	or	@C[3],@C[4]
Packit c4476c
	  mov	$A[1][2](%rdi),@C[1]
Packit c4476c
	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
Packit c4476c
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])
Packit c4476c
Packit c4476c
	and	@C[2],@C[3]
Packit c4476c
	  mov	$A[4][0](%rdi),@C[4]
Packit c4476c
	xor	@T[1],@C[3]		#           C[1] ^ (C[3] &  C[2])
Packit c4476c
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])
Packit c4476c
Packit c4476c
	or	@C[2],@T[1]
Packit c4476c
	  mov	$A[2][3](%rdi),@C[2]
Packit c4476c
	xor	@T[0],@T[1]		#           C[0] ^ (C[1] |  C[2])
Packit c4476c
	  mov	$A[3][4](%rdi),@C[3]
Packit c4476c
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])
Packit c4476c
Packit c4476c
Packit c4476c
	xor	@D[3],@C[2]
Packit c4476c
	xor	@D[4],@C[3]
Packit c4476c
	rol	\$$rhotates[2][3],@C[2]
Packit c4476c
	xor	@D[2],@C[1]
Packit c4476c
	rol	\$$rhotates[3][4],@C[3]
Packit c4476c
	xor	@D[0],@C[4]
Packit c4476c
	rol	\$$rhotates[1][2],@C[1]
Packit c4476c
	xor	@D[1],@C[0]
Packit c4476c
	rol	\$$rhotates[4][0],@C[4]
Packit c4476c
	 mov	@C[2],@T[0]
Packit c4476c
	 and	@C[3],@C[2]
Packit c4476c
	rol	\$$rhotates[0][1],@C[0]
Packit c4476c
Packit c4476c
	not	@C[3]
Packit c4476c
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] & C[3])
Packit c4476c
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] =  C[1] ^ ( C[2] & C[3])
Packit c4476c
Packit c4476c
	mov	@C[4],@T[1]
Packit c4476c
	and	@C[3],@C[4]
Packit c4476c
	  mov	$A[2][1](%rdi),@C[2]
Packit c4476c
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] & ~C[3])
Packit c4476c
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])
Packit c4476c
Packit c4476c
	or	@C[1],@T[0]
Packit c4476c
	  mov	$A[4][3](%rdi),@C[4]
Packit c4476c
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] | C[1])
Packit c4476c
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] =  C[0] ^ ( C[2] | C[1])
Packit c4476c
Packit c4476c
	and	@C[0],@C[1]
Packit c4476c
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] & C[0])
Packit c4476c
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] =  C[4] ^ ( C[1] & C[0])
Packit c4476c
Packit c4476c
	or	@C[0],@T[1]
Packit c4476c
	  mov	$A[1][0](%rdi),@C[1]
Packit c4476c
	xor	@C[3],@T[1]		#           ~C[3] ^ ( C[0] | C[4])
Packit c4476c
	  mov	$A[3][2](%rdi),@C[3]
Packit c4476c
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])
Packit c4476c
Packit c4476c
Packit c4476c
	mov	$A[0][4](%rdi),@C[0]
Packit c4476c
Packit c4476c
	xor	@D[1],@C[2]
Packit c4476c
	xor	@D[2],@C[3]
Packit c4476c
	rol	\$$rhotates[2][1],@C[2]
Packit c4476c
	xor	@D[0],@C[1]
Packit c4476c
	rol	\$$rhotates[3][2],@C[3]
Packit c4476c
	xor	@D[3],@C[4]
Packit c4476c
	rol	\$$rhotates[1][0],@C[1]
Packit c4476c
	xor	@D[4],@C[0]
Packit c4476c
	rol	\$$rhotates[4][3],@C[4]
Packit c4476c
	 mov	@C[2],@T[0]
Packit c4476c
	 or	@C[3],@C[2]
Packit c4476c
	rol	\$$rhotates[0][4],@C[0]
Packit c4476c
Packit c4476c
	not	@C[3]
Packit c4476c
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] | C[3])
Packit c4476c
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] =  C[1] ^ ( C[2] | C[3])
Packit c4476c
Packit c4476c
	mov	@C[4],@T[1]
Packit c4476c
	or	@C[3],@C[4]
Packit c4476c
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] | ~C[3])
Packit c4476c
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])
Packit c4476c
Packit c4476c
	and	@C[1],@T[0]
Packit c4476c
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] & C[1])
Packit c4476c
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] =  C[0] ^ ( C[2] & C[1])
Packit c4476c
Packit c4476c
	or	@C[0],@C[1]
Packit c4476c
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] | C[0])
Packit c4476c
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] =  C[4] ^ ( C[1] | C[0])
Packit c4476c
Packit c4476c
	and	@T[1],@C[0]
Packit c4476c
	xor	@C[3],@C[0]		#           ~C[3] ^ ( C[0] & C[4])
Packit c4476c
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])
Packit c4476c
Packit c4476c
Packit c4476c
	xor	$A[0][2](%rdi),@D[2]
Packit c4476c
	xor	$A[1][3](%rdi),@D[3]
Packit c4476c
	rol	\$$rhotates[0][2],@D[2]
Packit c4476c
	xor	$A[4][1](%rdi),@D[1]
Packit c4476c
	rol	\$$rhotates[1][3],@D[3]
Packit c4476c
	xor	$A[2][4](%rdi),@D[4]
Packit c4476c
	rol	\$$rhotates[4][1],@D[1]
Packit c4476c
	xor	$A[3][0](%rdi),@D[0]
Packit c4476c
	xchg	%rsi,%rdi
Packit c4476c
	rol	\$$rhotates[2][4],@D[4]
Packit c4476c
	rol	\$$rhotates[3][0],@D[0]
Packit c4476c
___
Packit c4476c
	# The row-4 inputs were built in the @D registers; re-bind @C[0..4] to
	# @D[2], @D[3], @D[4], @D[0], @D[1] so the row can be written in the
	# same C[...] notation as the other rows.
	@C = @D[2..4,0,1];
Packit c4476c
# Row 4 and the tail of __KeccakF1600, followed by the KeccakF1600 wrapper.
#
# Row 4 is stored through %rdi because the pointers have already been
# exchanged.  Two registers are then copied ("harmonize with the loop top")
# before the loop-back test.  The loop repeats until the low 8 bits of
# $iotas are zero, after which $iotas is rewound by 192 bytes (24 constants
# of 8 bytes each).
#
# KeccakF1600 saves %rbx, %rbp and %r12..%r15, advances %rdi by 100 to
# match the biased @A displacements, reserves 200 bytes of stack and points
# %rsi 100 bytes into it as the second state buffer.  It complements lanes
# A[0][1], A[0][2], A[1][3], A[2][2], A[3][2] and A[4][0] both before and
# after calling __KeccakF1600, then restores %rdi and the saved registers.
$code.=<<___;
Packit c4476c
	mov	@C[0],@T[0]
Packit c4476c
	and	@C[1],@C[0]
Packit c4476c
	not	@C[1]
Packit c4476c
	xor	@C[4],@C[0]		#            C[4] ^ ( C[0] & C[1])
Packit c4476c
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] =  C[4] ^ ( C[0] & C[1])
Packit c4476c
Packit c4476c
	mov	@C[2],@T[1]
Packit c4476c
	and	@C[1],@C[2]
Packit c4476c
	xor	@T[0],@C[2]		#            C[0] ^ ( C[2] & ~C[1])
Packit c4476c
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])
Packit c4476c
Packit c4476c
	or	@C[4],@T[0]
Packit c4476c
	xor	@C[3],@T[0]		#            C[3] ^ ( C[0] | C[4])
Packit c4476c
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] =  C[3] ^ ( C[0] | C[4])
Packit c4476c
Packit c4476c
	and	@C[3],@C[4]
Packit c4476c
	xor	@T[1],@C[4]		#            C[2] ^ ( C[4] & C[3])
Packit c4476c
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] =  C[2] ^ ( C[4] & C[3])
Packit c4476c
Packit c4476c
	or	@T[1],@C[3]
Packit c4476c
	xor	@C[1],@C[3]		#           ~C[1] ^ ( C[2] | C[3])
Packit c4476c
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])
Packit c4476c
Packit c4476c
	mov	@C[0],@C[1]		# harmonize with the loop top
Packit c4476c
	mov	@T[0],@C[0]
Packit c4476c
Packit c4476c
	test	\$255,$iotas
Packit c4476c
	jnz	.Loop
Packit c4476c
Packit c4476c
	lea	-192($iotas),$iotas	# rewind iotas
Packit c4476c
	ret
Packit c4476c
.cfi_endproc
Packit c4476c
.size	__KeccakF1600,.-__KeccakF1600
Packit c4476c
Packit c4476c
.type	KeccakF1600,\@abi-omnipotent
Packit c4476c
.align	32
Packit c4476c
KeccakF1600:
Packit c4476c
.cfi_startproc
Packit c4476c
	push	%rbx
Packit c4476c
.cfi_push	%rbx
Packit c4476c
	push	%rbp
Packit c4476c
.cfi_push	%rbp
Packit c4476c
	push	%r12
Packit c4476c
.cfi_push	%r12
Packit c4476c
	push	%r13
Packit c4476c
.cfi_push	%r13
Packit c4476c
	push	%r14
Packit c4476c
.cfi_push	%r14
Packit c4476c
	push	%r15
Packit c4476c
.cfi_push	%r15
Packit c4476c
Packit c4476c
	lea	100(%rdi),%rdi		# size optimization
Packit c4476c
	sub	\$200,%rsp
Packit c4476c
.cfi_adjust_cfa_offset	200
Packit c4476c
Packit c4476c
	notq	$A[0][1](%rdi)
Packit c4476c
	notq	$A[0][2](%rdi)
Packit c4476c
	notq	$A[1][3](%rdi)
Packit c4476c
	notq	$A[2][2](%rdi)
Packit c4476c
	notq	$A[3][2](%rdi)
Packit c4476c
	notq	$A[4][0](%rdi)
Packit c4476c
Packit c4476c
	lea	iotas(%rip),$iotas
Packit c4476c
	lea	100(%rsp),%rsi		# size optimization
Packit c4476c
Packit c4476c
	call	__KeccakF1600
Packit c4476c
Packit c4476c
	notq	$A[0][1](%rdi)
Packit c4476c
	notq	$A[0][2](%rdi)
Packit c4476c
	notq	$A[1][3](%rdi)
Packit c4476c
	notq	$A[2][2](%rdi)
Packit c4476c
	notq	$A[3][2](%rdi)
Packit c4476c
	notq	$A[4][0](%rdi)
Packit c4476c
	lea	-100(%rdi),%rdi		# preserve A[][]
Packit c4476c
Packit c4476c
	add	\$200,%rsp
Packit c4476c
.cfi_adjust_cfa_offset	-200
Packit c4476c
Packit c4476c
	pop	%r15
Packit c4476c
.cfi_pop	%r15
Packit c4476c
	pop	%r14
Packit c4476c
.cfi_pop	%r14
Packit c4476c
	pop	%r13
Packit c4476c
.cfi_pop	%r13
Packit c4476c
	pop	%r12
Packit c4476c
.cfi_pop	%r12
Packit c4476c
	pop	%rbp
Packit c4476c
.cfi_pop	%rbp
Packit c4476c
	pop	%rbx
Packit c4476c
.cfi_pop	%rbx
Packit c4476c
	ret
Packit c4476c
.cfi_endproc
Packit c4476c
.size	KeccakF1600,.-KeccakF1600
Packit c4476c
___
Packit c4476c
Packit c4476c
# SHA3_absorb: four-argument function whose arguments arrive in %rdi, %rsi,
# %rdx and %rcx, named here A_flat (state), inp, len and bsz.
#
# While len >= bsz, bsz/8 quadwords of input are XORed into the start of
# the state and __KeccakF1600 is called.  Six state lanes are complemented
# on entry and again on exit, as in KeccakF1600.  The stack frame is 232
# bytes: 200 for the second state buffer addressed through %rsi, and three
# slots at offsets 200, 208 and 216 where inp, len and bsz are kept across
# the call.  The remaining len is returned in %rax.
{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
Packit c4476c
     # %rdi and %rsi are taken by the state pointers handed to
     # __KeccakF1600, so the flat state cursor and the input pointer are
     # kept in %r8 and %r9 instead.
     ($A_flat,$inp) = ("%r8","%r9");
Packit c4476c
$code.=<<___;
Packit c4476c
.globl	SHA3_absorb
Packit c4476c
.type	SHA3_absorb,\@function,4
Packit c4476c
.align	32
Packit c4476c
SHA3_absorb:
Packit c4476c
.cfi_startproc
Packit c4476c
	push	%rbx
Packit c4476c
.cfi_push	%rbx
Packit c4476c
	push	%rbp
Packit c4476c
.cfi_push	%rbp
Packit c4476c
	push	%r12
Packit c4476c
.cfi_push	%r12
Packit c4476c
	push	%r13
Packit c4476c
.cfi_push	%r13
Packit c4476c
	push	%r14
Packit c4476c
.cfi_push	%r14
Packit c4476c
	push	%r15
Packit c4476c
.cfi_push	%r15
Packit c4476c
Packit c4476c
	lea	100(%rdi),%rdi		# size optimization
Packit c4476c
	sub	\$232,%rsp
Packit c4476c
.cfi_adjust_cfa_offset	232
Packit c4476c
Packit c4476c
	mov	%rsi,$inp
Packit c4476c
	lea	100(%rsp),%rsi		# size optimization
Packit c4476c
Packit c4476c
	notq	$A[0][1](%rdi)
Packit c4476c
	notq	$A[0][2](%rdi)
Packit c4476c
	notq	$A[1][3](%rdi)
Packit c4476c
	notq	$A[2][2](%rdi)
Packit c4476c
	notq	$A[3][2](%rdi)
Packit c4476c
	notq	$A[4][0](%rdi)
Packit c4476c
	lea	iotas(%rip),$iotas
Packit c4476c
Packit c4476c
	mov	$bsz,216-100(%rsi)	# save bsz
Packit c4476c
Packit c4476c
.Loop_absorb:
Packit c4476c
	cmp	$bsz,$len
Packit c4476c
	jc	.Ldone_absorb
Packit c4476c
Packit c4476c
	shr	\$3,$bsz
Packit c4476c
	lea	-100(%rdi),$A_flat
Packit c4476c
Packit c4476c
.Lblock_absorb:
Packit c4476c
	mov	($inp),%rax
Packit c4476c
	lea	8($inp),$inp
Packit c4476c
	xor	($A_flat),%rax
Packit c4476c
	lea	8($A_flat),$A_flat
Packit c4476c
	sub	\$8,$len
Packit c4476c
	mov	%rax,-8($A_flat)
Packit c4476c
	sub	\$1,$bsz
Packit c4476c
	jnz	.Lblock_absorb
Packit c4476c
Packit c4476c
	mov	$inp,200-100(%rsi)	# save inp
Packit c4476c
	mov	$len,208-100(%rsi)	# save len
Packit c4476c
	call	__KeccakF1600
Packit c4476c
	mov	200-100(%rsi),$inp	# pull inp
Packit c4476c
	mov	208-100(%rsi),$len	# pull len
Packit c4476c
	mov	216-100(%rsi),$bsz	# pull bsz
Packit c4476c
	jmp	.Loop_absorb
Packit c4476c
Packit c4476c
.align	32
Packit c4476c
.Ldone_absorb:
Packit c4476c
	mov	$len,%rax		# return value
Packit c4476c
Packit c4476c
	notq	$A[0][1](%rdi)
Packit c4476c
	notq	$A[0][2](%rdi)
Packit c4476c
	notq	$A[1][3](%rdi)
Packit c4476c
	notq	$A[2][2](%rdi)
Packit c4476c
	notq	$A[3][2](%rdi)
Packit c4476c
	notq	$A[4][0](%rdi)
Packit c4476c
Packit c4476c
	add	\$232,%rsp
Packit c4476c
.cfi_adjust_cfa_offset	-232
Packit c4476c
Packit c4476c
	pop	%r15
Packit c4476c
.cfi_pop	%r15
Packit c4476c
	pop	%r14
Packit c4476c
.cfi_pop	%r14
Packit c4476c
	pop	%r13
Packit c4476c
.cfi_pop	%r13
Packit c4476c
	pop	%r12
Packit c4476c
.cfi_pop	%r12
Packit c4476c
	pop	%rbp
Packit c4476c
.cfi_pop	%rbp
Packit c4476c
	pop	%rbx
Packit c4476c
.cfi_pop	%rbx
Packit c4476c
	ret
Packit c4476c
.cfi_endproc
Packit c4476c
.size	SHA3_absorb,.-SHA3_absorb
Packit c4476c
___
Packit c4476c
}
Packit c4476c
# SHA3_squeeze: four-argument function whose arguments arrive in %rdi, %rsi,
# %rdx and %rcx, named here A_flat (state), out, len and bsz.
#
# Output is copied from the state 8 bytes at a time.  %rcx counts down the
# bsz/8 quadwords available in the current block; when it reaches zero and
# more output is still wanted, KeccakF1600 is called and the read cursor
# (%r8) and the counter are reloaded.  A final remainder of fewer than 8
# bytes is copied with rep movsb.
#
# The unwind annotations in the epilogue must name the register that was
# just popped: the last pair is pop %r12 / .cfi_pop %r12 (it previously
# said %r13, which mis-described the saved %r12 to unwinders).
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
Packit c4476c
     # out, len and bsz/8 live in callee-saved registers so that they
     # survive the call to KeccakF1600.
     ($out,$len,$bsz) = ("%r12","%r13","%r14");
Packit c4476c
Packit c4476c
$code.=<<___;
Packit c4476c
.globl	SHA3_squeeze
Packit c4476c
.type	SHA3_squeeze,\@function,4
Packit c4476c
.align	32
Packit c4476c
SHA3_squeeze:
Packit c4476c
.cfi_startproc
Packit c4476c
	push	%r12
Packit c4476c
.cfi_push	%r12
Packit c4476c
	push	%r13
Packit c4476c
.cfi_push	%r13
Packit c4476c
	push	%r14
Packit c4476c
.cfi_push	%r14
Packit c4476c
Packit c4476c
	shr	\$3,%rcx
Packit c4476c
	mov	$A_flat,%r8
Packit c4476c
	mov	%rsi,$out
Packit c4476c
	mov	%rdx,$len
Packit c4476c
	mov	%rcx,$bsz
Packit c4476c
	jmp	.Loop_squeeze
Packit c4476c
Packit c4476c
.align	32
Packit c4476c
.Loop_squeeze:
Packit c4476c
	cmp	\$8,$len
Packit c4476c
	jb	.Ltail_squeeze
Packit c4476c
Packit c4476c
	mov	(%r8),%rax
Packit c4476c
	lea	8(%r8),%r8
Packit c4476c
	mov	%rax,($out)
Packit c4476c
	lea	8($out),$out
Packit c4476c
	sub	\$8,$len		# len -= 8
Packit c4476c
	jz	.Ldone_squeeze
Packit c4476c
Packit c4476c
	sub	\$1,%rcx		# bsz--
Packit c4476c
	jnz	.Loop_squeeze
Packit c4476c
Packit c4476c
	call	KeccakF1600
Packit c4476c
	mov	$A_flat,%r8
Packit c4476c
	mov	$bsz,%rcx
Packit c4476c
	jmp	.Loop_squeeze
Packit c4476c
Packit c4476c
.Ltail_squeeze:
Packit c4476c
	mov	%r8, %rsi
Packit c4476c
	mov	$out,%rdi
Packit c4476c
	mov	$len,%rcx
Packit c4476c
	.byte	0xf3,0xa4		# rep	movsb
Packit c4476c
Packit c4476c
.Ldone_squeeze:
Packit c4476c
	pop	%r14
Packit c4476c
.cfi_pop	%r14
Packit c4476c
	pop	%r13
Packit c4476c
.cfi_pop	%r13
Packit c4476c
	pop	%r12
Packit c4476c
.cfi_pop	%r12
Packit c4476c
	ret
Packit c4476c
.cfi_endproc
Packit c4476c
.size	SHA3_squeeze,.-SHA3_squeeze
Packit c4476c
___
Packit c4476c
}
Packit c4476c
# Round-constant table and the module's identification string.
#
# The table is placed after a 256-byte alignment directive and eight zero
# quadwords (64 bytes), and holds 24 constants (192 bytes), so it ends
# exactly on a 256-byte boundary.  The round loop relies on that layout:
# it stops when the low 8 bits of the advancing $iotas pointer become zero
# and then rewinds the pointer by 192.
$code.=<<___;
Packit c4476c
.align	256
Packit c4476c
	.quad	0,0,0,0,0,0,0,0
Packit c4476c
.type	iotas,\@object
Packit c4476c
iotas:
Packit c4476c
	.quad	0x0000000000000001
Packit c4476c
	.quad	0x0000000000008082
Packit c4476c
	.quad	0x800000000000808a
Packit c4476c
	.quad	0x8000000080008000
Packit c4476c
	.quad	0x000000000000808b
Packit c4476c
	.quad	0x0000000080000001
Packit c4476c
	.quad	0x8000000080008081
Packit c4476c
	.quad	0x8000000000008009
Packit c4476c
	.quad	0x000000000000008a
Packit c4476c
	.quad	0x0000000000000088
Packit c4476c
	.quad	0x0000000080008009
Packit c4476c
	.quad	0x000000008000000a
Packit c4476c
	.quad	0x000000008000808b
Packit c4476c
	.quad	0x800000000000008b
Packit c4476c
	.quad	0x8000000000008089
Packit c4476c
	.quad	0x8000000000008003
Packit c4476c
	.quad	0x8000000000008002
Packit c4476c
	.quad	0x8000000000000080
Packit c4476c
	.quad	0x000000000000800a
Packit c4476c
	.quad	0x800000008000000a
Packit c4476c
	.quad	0x8000000080008081
Packit c4476c
	.quad	0x8000000000008080
Packit c4476c
	.quad	0x0000000080000001
Packit c4476c
	.quad	0x8000000080008008
Packit c4476c
.size	iotas,.-iotas
Packit c4476c
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
Packit c4476c
___
Packit c4476c
Packit c4476c
# Write the accumulated template to STDOUT (the translator pipe) one line
# at a time.  The loop works on $_ so that either of the optional per-line
# substitutions below can be enabled simply by uncommenting it.
for (split(/\n/, $code)) {
Packit c4476c
	# Below replacement results in 11.2 on Sandy Bridge, 9.4 on
Packit c4476c
	# Haswell, but it hurts other processors by up to 2-3-4x...
Packit c4476c
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
Packit c4476c
	# Below replacement results in 9.3 on Haswell [as well as
Packit c4476c
	# on Ryzen, i.e. it *hurts* Ryzen]...
Packit c4476c
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
Packit c4476c
Packit c4476c
	print "$_\n";
Packit c4476c
}
Packit c4476c
Packit c4476c
# Closing the pipe is where a write or translator failure is reported.
close(STDOUT) || die "error closing STDOUT: $!";