#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# Below code is [lane complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
# instead of actually unrolling the loop pair-wise I simply flip
# pointers to T[][] and A[][] at the end of round. Since number of
# rounds is even, last round writes to A[][] and everything works out.
# How does it compare to x86_64 assembly module in Keccak Code Package?
# Depending on processor it's either as fast or faster by up to 15%...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# P4			25.8
# Core 2		12.9
# Westmere		13.7
# Sandy Bridge		12.9(**)
# Haswell		9.6
# Skylake		9.4
# Silvermont		22.8
# Goldmont		15.8
# VIA Nano		17.3
# Sledgehammer		13.3
# Bulldozer		16.5
# Ryzen			8.8
#
# (*)	Corresponds to SHA3-256. Improvement over compiler-generated code
#	varies a lot, most common coefficient is 15% in comparison to
#	gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)	Sandy Bridge has broken rotate instruction. Performance can be
#	improved by 14% by replacing rotates with double-precision
#	shift with same register as source and destination.

# Command line: [flavour] [output]. If the first argument contains a dot
# it is taken to be the output file name and no flavour is assumed.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# $win64 is set for nasm/masm/mingw64 flavours or when the output file
# name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for x86_64-xlate.pl next to this script, then in ../../perlasm/
# relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through
# x86_64-xlate.pl. Fail right away if the pipe cannot be set up instead
# of discovering it only when STDOUT is closed at the end of the script.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Displacement of lane A[y][x] from the biased state pointer:
# $A[$y][$x] = 8*(5*$y+$x) - 100. The entry points below add 100 to the
# pointer ("size optimization"), which keeps all 25 displacements in the
# range -100..92.
my @A = map { my $row = $_; [ map { 8*(5*$row + $_) - 100 } 0..4 ] } 0..4;

# Register allocation used by the assembly templates. @C and @D are
# re-bound to different registers between template chunks further down.
my @C = qw(%rax %rbx %rcx %rdx %rbp);
my @D = map { "%r$_" } 8..12;
my @T = map { "%r$_" } 13..14;		# scratch
my $iotas = "%r15";			# pointer into the iotas table

# Per-lane rotation counts used as immediates of the rol instructions.
# The templates index this table with the same pair of indices as the
# A[][] lane being rotated.
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# __KeccakF1600, template chunk 1: function entry and the first step of
# the round loop.
#
# At entry @C[0..4] is loaded with row 4 of the state (A[4][0..4]). At
# the top of .Loop @D[0..3] picks up A[0][0], A[1][1], A[2][2], A[3][3]
# and @D[4] gets a copy of A[4][4]; the remaining lanes are xored into
# @C so that each @C[x] becomes the xor of column x. The rol/xor pairs at
# the end then turn those column sums into the D[] values named in the
# per-line assembly comments.
$code.=<<___;
.text

.type	__KeccakF1600,\@abi-omnipotent
.align	32
__KeccakF1600:
.cfi_startproc
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	jmp	.Loop

.align	32
.Loop:
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]

	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	@D[0],         @C[0]
	xor	$A[0][1](%rdi),@C[1]
	 xor	$A[1][2](%rdi),@C[2]
	 xor	$A[1][0](%rdi),@C[0]
	mov	@C[4],@D[4]
	xor	$A[0][4](%rdi),@C[4]

	xor	@D[2],         @C[2]
	xor	$A[2][0](%rdi),@C[0]
	 xor	$A[1][3](%rdi),@C[3]
	 xor	@D[1],         @C[1]
	 xor	$A[1][4](%rdi),@C[4]

	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	 xor	$A[2][3](%rdi),@C[3]
	 xor	$A[2][1](%rdi),@C[1]
	 xor	$A[2][4](%rdi),@C[4]

	mov	@C[2],@T[0]
	rol	\$1,@C[2]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	 xor	@D[3],         @C[3]

	rol	\$1,@C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	 xor	$A[3][1](%rdi),@C[1]

	rol	\$1,@C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	 xor	$A[3][4](%rdi),@C[4]

	rol	\$1,@C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rol	\$1,@C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
	# Re-bind the Perl-side names, no instructions are emitted here.
	# The registers that were @C[1],@C[2],@C[3],@C[4],@C[0] now hold
	# D[0..4] and become @D; the registers that were @D still hold
	# A[0][0],A[1][1],A[2][2],A[3][3],A[4][4] and become @C.
	(@D[0..4], @C) = (@C[1..4,0], @D);
# __KeccakF1600, template chunk 2: output rows 0..3 of one round.
#
# Each group xors the matching D[] value into five lanes (the first
# group reuses the lanes already sitting in @C, later groups load them
# via %rdi), rotates them by the @rhotates immediates and combines them
# with the and/or/not pattern spelled out in the comment on every store.
# Results R[y][x] are written via %rsi. The quadword at ($iotas) is xored
# into R[0][0] and $iotas is advanced by 8. The last group of this chunk
# starts row 4 directly in the @D registers and exchanges %rsi and %rdi.
$code.=<<___;
	xor	@D[1],@C[1]
	xor	@D[2],@C[2]
	rol	\$$rhotates[1][1],@C[1]
	xor	@D[3],@C[3]
	xor	@D[4],@C[4]
	rol	\$$rhotates[2][2],@C[2]
	xor	@D[0],@C[0]
	 mov	@C[1],@T[0]
	rol	\$$rhotates[3][3],@C[3]
	 or	@C[2],@C[1]
	 xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]

	 xor	($iotas),@C[1]
	 lea	8($iotas),$iotas

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	 mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
	not	@C[2]
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])

	or	@C[3],@C[2]
	  mov	$A[4][2](%rdi),@C[4]
	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])

	and	@C[0],@T[0]
	  mov	$A[1][4](%rdi),@C[1]
	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
	  mov	$A[2][0](%rdi),@C[2]
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	  mov	$A[0][3](%rdi),@C[0]
	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
	  mov	$A[3][1](%rdi),@C[3]
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])


	xor	@D[3],@C[0]
	xor	@D[2],@C[4]
	rol	\$$rhotates[0][3],@C[0]
	xor	@D[1],@C[3]
	xor	@D[4],@C[1]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	xor	@D[0],@C[2]
	rol	\$$rhotates[1][4],@C[1]
	 mov	@C[0],@T[0]
	 or	@C[4],@C[0]
	rol	\$$rhotates[2][0],@C[2]

	xor	@C[3],@C[0]		#           C[3] ^ (C[0] |  C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])

	mov	@C[1],@T[1]
	and	@T[0],@C[1]
	  mov	$A[0][1](%rdi),@C[0]
	xor	@C[4],@C[1]		#           C[4] ^ (C[1] &  C[0])
	not	@C[4]
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])

	or	@C[3],@C[4]
	  mov	$A[1][2](%rdi),@C[1]
	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])

	and	@C[2],@C[3]
	  mov	$A[4][0](%rdi),@C[4]
	xor	@T[1],@C[3]		#           C[1] ^ (C[3] &  C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])

	or	@C[2],@T[1]
	  mov	$A[2][3](%rdi),@C[2]
	xor	@T[0],@T[1]		#           C[0] ^ (C[1] |  C[2])
	  mov	$A[3][4](%rdi),@C[3]
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])


	xor	@D[3],@C[2]
	xor	@D[4],@C[3]
	rol	\$$rhotates[2][3],@C[2]
	xor	@D[2],@C[1]
	rol	\$$rhotates[3][4],@C[3]
	xor	@D[0],@C[4]
	rol	\$$rhotates[1][2],@C[1]
	xor	@D[1],@C[0]
	rol	\$$rhotates[4][0],@C[4]
	 mov	@C[2],@T[0]
	 and	@C[3],@C[2]
	rol	\$$rhotates[0][1],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] =  C[1] ^ ( C[2] & C[3])

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	  mov	$A[2][1](%rdi),@C[2]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])

	or	@C[1],@T[0]
	  mov	$A[4][3](%rdi),@C[4]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] =  C[0] ^ ( C[2] | C[1])

	and	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] =  C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	  mov	$A[1][0](%rdi),@C[1]
	xor	@C[3],@T[1]		#           ~C[3] ^ ( C[0] | C[4])
	  mov	$A[3][2](%rdi),@C[3]
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	mov	$A[0][4](%rdi),@C[0]

	xor	@D[1],@C[2]
	xor	@D[2],@C[3]
	rol	\$$rhotates[2][1],@C[2]
	xor	@D[0],@C[1]
	rol	\$$rhotates[3][2],@C[3]
	xor	@D[3],@C[4]
	rol	\$$rhotates[1][0],@C[1]
	xor	@D[4],@C[0]
	rol	\$$rhotates[4][3],@C[4]
	 mov	@C[2],@T[0]
	 or	@C[3],@C[2]
	rol	\$$rhotates[0][4],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] =  C[1] ^ ( C[2] | C[3])

	mov	@C[4],@T[1]
	or	@C[3],@C[4]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])

	and	@C[1],@T[0]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] =  C[0] ^ ( C[2] & C[1])

	or	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] =  C[4] ^ ( C[1] | C[0])

	and	@T[1],@C[0]
	xor	@C[3],@C[0]		#           ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
	xchg	%rsi,%rdi
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
___
	# Re-bind names again, no instructions are emitted here. The five
	# row-4 lanes were built in place in the @D registers; the next
	# chunk addresses them as @C[0..4] = @D[2],@D[3],@D[4],@D[0],@D[1].
	@C = @D[2..4,0,1];
# Template chunk 3: the end of __KeccakF1600 followed by KeccakF1600.
#
# __KeccakF1600 tail: finishes row 4 (stored via %rdi, i.e. after the
# xchg in the previous chunk), moves two registers so the next pass
# through .Loop finds its inputs in place ("harmonize with the loop
# top"), and repeats while the low 8 bits of $iotas are non-zero. On exit
# $iotas is moved back by 192 bytes, i.e. 24 entries of 8 bytes.
#
# KeccakF1600: saves %rbx, %rbp and %r12-%r15, biases %rdi by 100,
# reserves 200 bytes of stack as the second buffer (%rsi), complements
# six lanes of the state before and again after calling __KeccakF1600,
# and restores %rdi to its incoming value.
$code.=<<___;
	mov	@C[0],@T[0]
	and	@C[1],@C[0]
	not	@C[1]
	xor	@C[4],@C[0]		#            C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] =  C[4] ^ ( C[0] & C[1])

	mov	@C[2],@T[1]
	and	@C[1],@C[2]
	xor	@T[0],@C[2]		#            C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])

	or	@C[4],@T[0]
	xor	@C[3],@T[0]		#            C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] =  C[3] ^ ( C[0] | C[4])

	and	@C[3],@C[4]
	xor	@T[1],@C[4]		#            C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] =  C[2] ^ ( C[4] & C[3])

	or	@T[1],@C[3]
	xor	@C[1],@C[3]		#           ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[0],@C[1]		# harmonize with the loop top
	mov	@T[0],@C[0]

	test	\$255,$iotas
	jnz	.Loop

	lea	-192($iotas),$iotas	# rewind iotas
	ret
.cfi_endproc
.size	__KeccakF1600,.-__KeccakF1600

.type	KeccakF1600,\@abi-omnipotent
.align	32
KeccakF1600:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$200,%rsp
.cfi_adjust_cfa_offset	200

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization

	call	__KeccakF1600

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	-100(%rdi),%rdi		# preserve A[][]

	add	\$200,%rsp
.cfi_adjust_cfa_offset	-200

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	KeccakF1600,.-KeccakF1600
___

# SHA3_absorb: the four arguments arrive in %rdi (state), %rsi (input),
# %rdx (len) and %rcx (bsz). %rdi and %rsi are needed as the two buffer
# pointers of __KeccakF1600, so the flat state cursor and the input
# pointer are kept in %r8 and %r9 instead.
#
# While len >= bsz the routine xors bsz/8 quadwords of input into the
# start of the state, saves inp/len in the stack frame (bsz was saved
# once up front), calls __KeccakF1600 and reloads all three. The
# remaining len is returned in %rax. The same six lanes as in KeccakF1600
# are complemented once on entry and once on exit.
{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function,4
.align	32
SHA3_absorb:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$232,%rsp
.cfi_adjust_cfa_offset	232

	mov	%rsi,$inp
	lea	100(%rsp),%rsi		# size optimization

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	iotas(%rip),$iotas

	mov	$bsz,216-100(%rsi)	# save bsz

.Loop_absorb:
	cmp	$bsz,$len
	jc	.Ldone_absorb

	shr	\$3,$bsz
	lea	-100(%rdi),$A_flat

.Lblock_absorb:
	mov	($inp),%rax
	lea	8($inp),$inp
	xor	($A_flat),%rax
	lea	8($A_flat),$A_flat
	sub	\$8,$len
	mov	%rax,-8($A_flat)
	sub	\$1,$bsz
	jnz	.Lblock_absorb

	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	call	__KeccakF1600
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	jmp	.Loop_absorb

.align	32
.Ldone_absorb:
	mov	$len,%rax		# return value

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	add	\$232,%rsp
.cfi_adjust_cfa_offset	-232

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	SHA3_absorb,.-SHA3_absorb
___
}
# SHA3_squeeze: the four arguments arrive in %rdi (state), %rsi (output),
# %rdx (len) and %rcx (bsz). out, len and bsz/8 are moved to %r12, %r13
# and %r14, which KeccakF1600 saves and restores; %r8 walks the state
# and %rcx counts down the quadwords left in the current block.
#
# Whole quadwords are copied from the state to the output. When a block
# is exhausted and output is still wanted, KeccakF1600 is called and the
# cursor and counter are reset. A final remainder of fewer than 8 bytes
# is copied with rep movsb.
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function,4
.align	32
SHA3_squeeze:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14

	shr	\$3,%rcx
	mov	$A_flat,%r8
	mov	%rsi,$out
	mov	%rdx,$len
	mov	%rcx,$bsz
	jmp	.Loop_squeeze

.align	32
.Loop_squeeze:
	cmp	\$8,$len
	jb	.Ltail_squeeze

	mov	(%r8),%rax
	lea	8(%r8),%r8
	mov	%rax,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze

	sub	\$1,%rcx		# bsz--
	jnz	.Loop_squeeze

	call	KeccakF1600
	mov	$A_flat,%r8
	mov	$bsz,%rcx
	jmp	.Loop_squeeze

.Ltail_squeeze:
	mov	%r8, %rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep	movsb

.Ldone_squeeze:
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	ret
.cfi_endproc
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
# iotas: the 24 quadwords that __KeccakF1600 xors into R[0][0], one per
# pass through .Loop.
#
# The table is placed 64 bytes (eight zero quadwords) past a 256-byte
# boundary. 24 entries of 8 bytes later the pointer is at 64+192 = 256,
# so its low 8 bits are zero, which is what the "test \$255" at the
# bottom of .Loop uses as the end-of-rounds condition.
$code.=<<___;
.align	256
	.quad	0,0,0,0,0,0,0,0
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Write the accumulated template to STDOUT (the pipe into x86_64-xlate.pl
# set up at the top of the script), one line at a time. The two
# commented-out substitutions are alternative encodings of the rotates,
# kept for reference together with their measured effect.
foreach (split("\n",$code)) {
	# Below replacement results in 11.2 on Sandy Bridge, 9.4 on
	# Haswell, but it hurts other processors by up to 2-3-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
	# Below replacement results in 9.3 on Haswell [as well as
	# on Ryzen, i.e. it *hurts* Ryzen]...
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

	print $_, "\n";
}

# Closing the pipe waits for the translator; a failure surfaces here.
close STDOUT or die "error closing STDOUT: $!";