Blame crypto/sha/asm/keccak1600-armv8.pl

Packit c4476c
#!/usr/bin/env perl
Packit c4476c
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
#
Packit c4476c
# ====================================================================
Packit c4476c
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
Packit c4476c
# project. The module is, however, dual licensed under OpenSSL and
Packit c4476c
# CRYPTOGAMS licenses depending on where you obtain it. For further
Packit c4476c
# details see http://www.openssl.org/~appro/cryptogams/.
Packit c4476c
# ====================================================================
Packit c4476c
#
Packit c4476c
# Keccak-1600 for ARMv8.
Packit c4476c
#
Packit c4476c
# June 2017.
Packit c4476c
#
Packit c4476c
# This is a straightforward KECCAK_1X_ALT implementation. It makes no
Packit c4476c
# sense to attempt a SIMD/NEON implementation for the following reason.
Packit c4476c
# 64-bit lanes of vector registers can't be addressed as easily as in
Packit c4476c
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
Packit c4476c
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
Packit c4476c
# same processor. Even though it takes more scalar xor's and andn's,
Packit c4476c
# it gets compensated by availability of rotate. Not to forget that
Packit c4476c
# most processors achieve higher issue rate with scalar instructions.
Packit c4476c
#
Packit c4476c
# February 2018.
Packit c4476c
#
Packit c4476c
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
Packit c4476c
# variant with register permutation/rotation twist that allows to
Packit c4476c
# eliminate copies to temporary registers. If you look closely you'll
Packit c4476c
# notice that it uses only one lane of vector registers. The new
Packit c4476c
# instructions effectively facilitate parallel hashing, which we don't
Packit c4476c
# support [yet?]. But lowest-level core procedure is prepared for it.
Packit c4476c
# The inner round is 67 [vector] instructions, so it's not actually
Packit c4476c
# obvious that it will provide performance improvement [in serial
Packit c4476c
# hash] as long as vector instructions issue rate is limited to 1 per
Packit c4476c
# cycle...
Packit c4476c
#
Packit c4476c
######################################################################
Packit c4476c
# Numbers are cycles per processed byte.
Packit c4476c
#
Packit c4476c
#		r=1088(*)
Packit c4476c
#
Packit c4476c
# Cortex-A53	13
Packit c4476c
# Cortex-A57	12
Packit c4476c
# X-Gene	14
Packit c4476c
# Mongoose	10
Packit c4476c
# Kryo		12
Packit c4476c
# Denver	7.8
Packit c4476c
# Apple A7	7.2
Packit c4476c
# ThunderX2	9.7
Packit c4476c
#
Packit c4476c
# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
Packit c4476c
#	because they vary too much from compiler to compiler. Newer
Packit c4476c
#	compiler does much better and improvement varies from 5% on
Packit c4476c
#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
Packit c4476c
#	compiler this code is at least 2x faster...
Packit c4476c
Packit c4476c
# The first two command-line arguments are taken as $flavour and $output;
# both are forwarded unchanged to arm-xlate.pl below.
$flavour = shift;
$output  = shift;

# Locate the arm-xlate.pl translator: first in the directory this script
# was started from, then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on goes through the translator.
# The translator path is quoted so that a directory name containing spaces
# survives the shell, and a pipe that cannot be started is fatal rather
# than silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
Packit c4476c
Packit c4476c
# Per-lane rotation amounts; $rhotates[i][j] is the amount applied to state
# lane A[i][j].  The generated instructions rotate right, which is why the
# templates below always write "64-$rhotates[i][j]".
my @rhotates = (
    [  0,  1, 62, 28, 27 ],
    [ 36, 44,  6, 55, 20 ],
    [  3, 10, 43, 25, 39 ],
    [ 41, 45, 15, 21,  8 ],
    [ 18,  2, 61, 56, 14 ],
);
Packit c4476c
Packit c4476c
$code.=<<___;
Packit c4476c
.text
Packit c4476c
Packit c4476c
.align 8	// strategic alignment and padding that allows to use
Packit c4476c
		// address value as loop termination condition...
Packit c4476c
	.quad	0,0,0,0,0,0,0,0
Packit c4476c
.type	iotas,%object
Packit c4476c
iotas:
Packit c4476c
	.quad	0x0000000000000001
Packit c4476c
	.quad	0x0000000000008082
Packit c4476c
	.quad	0x800000000000808a
Packit c4476c
	.quad	0x8000000080008000
Packit c4476c
	.quad	0x000000000000808b
Packit c4476c
	.quad	0x0000000080000001
Packit c4476c
	.quad	0x8000000080008081
Packit c4476c
	.quad	0x8000000000008009
Packit c4476c
	.quad	0x000000000000008a
Packit c4476c
	.quad	0x0000000000000088
Packit c4476c
	.quad	0x0000000080008009
Packit c4476c
	.quad	0x000000008000000a
Packit c4476c
	.quad	0x000000008000808b
Packit c4476c
	.quad	0x800000000000008b
Packit c4476c
	.quad	0x8000000000008089
Packit c4476c
	.quad	0x8000000000008003
Packit c4476c
	.quad	0x8000000000008002
Packit c4476c
	.quad	0x8000000000000080
Packit c4476c
	.quad	0x000000000000800a
Packit c4476c
	.quad	0x800000008000000a
Packit c4476c
	.quad	0x8000000080008081
Packit c4476c
	.quad	0x8000000000008080
Packit c4476c
	.quad	0x0000000080000001
Packit c4476c
	.quad	0x8000000080008008
Packit c4476c
.size	iotas,.-iotas
Packit c4476c
___
Packit c4476c
								{{{
Packit c4476c
# Register names for the scalar code path.  $A[i][j] is the general-purpose
# register that holds state lane A[i][j]: row i starts at x(5*i) and covers
# five consecutive registers.
my @A = map {
    my $first = $_;
    [ map { "x" . ($first + $_) } 0 .. 4 ]
} (0, 5, 10, 15, 20);
$A[3][3] = "x25";	# x18 is reserved

# Scratch registers used by the round function.
my @C = map { "x" . $_ } (26, 27, 28, 30);
Packit c4476c
Packit c4476c
$code.=<<___;
Packit c4476c
.type	KeccakF1600_int,%function
Packit c4476c
.align	5
Packit c4476c
KeccakF1600_int:
Packit c4476c
	adr	$C[2],iotas
Packit c4476c
	.inst	0xd503233f			// paciasp
Packit c4476c
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
Packit c4476c
	b	.Loop
Packit c4476c
.align	4
Packit c4476c
.Loop:
Packit c4476c
	////////////////////////////////////////// Theta
Packit c4476c
	eor	$C[0],$A[0][0],$A[1][0]
Packit c4476c
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
Packit c4476c
	eor	$C[1],$A[0][1],$A[1][1]
Packit c4476c
	eor	$C[2],$A[0][2],$A[1][2]
Packit c4476c
	eor	$C[3],$A[0][3],$A[1][3]
Packit c4476c
___
Packit c4476c
	# A[0][4] and A[1][4] have just been stored at [sp,#0] by the emitted
	# "offload pair" stp, so their registers double as C[4] and C[5] until
	# the matching ldp reloads them.
	@C[4,5] = ($A[0][4], $A[1][4]);
Packit c4476c
$code.=<<___;
Packit c4476c
	eor	$C[4],$A[0][4],$A[1][4]
Packit c4476c
	eor	$C[0],$C[0],$A[2][0]
Packit c4476c
	eor	$C[1],$C[1],$A[2][1]
Packit c4476c
	eor	$C[2],$C[2],$A[2][2]
Packit c4476c
	eor	$C[3],$C[3],$A[2][3]
Packit c4476c
	eor	$C[4],$C[4],$A[2][4]
Packit c4476c
	eor	$C[0],$C[0],$A[3][0]
Packit c4476c
	eor	$C[1],$C[1],$A[3][1]
Packit c4476c
	eor	$C[2],$C[2],$A[3][2]
Packit c4476c
	eor	$C[3],$C[3],$A[3][3]
Packit c4476c
	eor	$C[4],$C[4],$A[3][4]
Packit c4476c
	eor	$C[0],$C[0],$A[4][0]
Packit c4476c
	eor	$C[2],$C[2],$A[4][2]
Packit c4476c
	eor	$C[1],$C[1],$A[4][1]
Packit c4476c
	eor	$C[3],$C[3],$A[4][3]
Packit c4476c
	eor	$C[4],$C[4],$A[4][4]
Packit c4476c
Packit c4476c
	eor	$C[5],$C[0],$C[2],ror#63
Packit c4476c
Packit c4476c
	eor	$A[0][1],$A[0][1],$C[5]
Packit c4476c
	eor	$A[1][1],$A[1][1],$C[5]
Packit c4476c
	eor	$A[2][1],$A[2][1],$C[5]
Packit c4476c
	eor	$A[3][1],$A[3][1],$C[5]
Packit c4476c
	eor	$A[4][1],$A[4][1],$C[5]
Packit c4476c
Packit c4476c
	eor	$C[5],$C[1],$C[3],ror#63
Packit c4476c
	eor	$C[2],$C[2],$C[4],ror#63
Packit c4476c
	eor	$C[3],$C[3],$C[0],ror#63
Packit c4476c
	eor	$C[4],$C[4],$C[1],ror#63
Packit c4476c
Packit c4476c
	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
Packit c4476c
	eor	$A[1][2],$A[1][2],$C[5]
Packit c4476c
	eor	$A[2][2],$A[2][2],$C[5]
Packit c4476c
	eor	$A[3][2],$A[3][2],$C[5]
Packit c4476c
	eor	$A[4][2],$A[4][2],$C[5]
Packit c4476c
Packit c4476c
	eor	$A[0][0],$A[0][0],$C[4]
Packit c4476c
	eor	$A[1][0],$A[1][0],$C[4]
Packit c4476c
	eor	$A[2][0],$A[2][0],$C[4]
Packit c4476c
	eor	$A[3][0],$A[3][0],$C[4]
Packit c4476c
	eor	$A[4][0],$A[4][0],$C[4]
Packit c4476c
___
Packit c4476c
	# The borrowed registers revert to A[0][4] and A[1][4] (the next
	# emitted instruction reloads them), so drop the C[4]/C[5] aliases.
	@C[4,5] = (undef) x 2;
Packit c4476c
$code.=<<___;
Packit c4476c
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
Packit c4476c
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
Packit c4476c
	eor	$A[1][3],$A[1][3],$C[2]
Packit c4476c
	eor	$A[2][3],$A[2][3],$C[2]
Packit c4476c
	eor	$A[3][3],$A[3][3],$C[2]
Packit c4476c
	eor	$A[4][3],$A[4][3],$C[2]
Packit c4476c
Packit c4476c
	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
Packit c4476c
	eor	$A[1][4],$A[1][4],$C[3]
Packit c4476c
	eor	$A[2][4],$A[2][4],$C[3]
Packit c4476c
	eor	$A[3][4],$A[3][4],$C[3]
Packit c4476c
	eor	$A[4][4],$A[4][4],$C[3]
Packit c4476c
Packit c4476c
	////////////////////////////////////////// Rho+Pi
Packit c4476c
	mov	$C[3],$A[0][1]
Packit c4476c
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
Packit c4476c
	//mov	$C[1],$A[0][2]
Packit c4476c
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
Packit c4476c
	//mov	$C[0],$A[0][3]
Packit c4476c
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
Packit c4476c
	//mov	$C[2],$A[0][4]
Packit c4476c
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]
Packit c4476c
Packit c4476c
	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
Packit c4476c
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
Packit c4476c
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
Packit c4476c
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]
Packit c4476c
Packit c4476c
	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
Packit c4476c
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
Packit c4476c
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
Packit c4476c
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]
Packit c4476c
Packit c4476c
	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
Packit c4476c
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
Packit c4476c
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
Packit c4476c
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]
Packit c4476c
Packit c4476c
	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
Packit c4476c
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
Packit c4476c
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
Packit c4476c
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]
Packit c4476c
Packit c4476c
	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
Packit c4476c
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
Packit c4476c
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
Packit c4476c
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]
Packit c4476c
Packit c4476c
	////////////////////////////////////////// Chi+Iota
Packit c4476c
	bic	$C[0],$A[0][2],$A[0][1]
Packit c4476c
	bic	$C[1],$A[0][3],$A[0][2]
Packit c4476c
	bic	$C[2],$A[0][0],$A[0][4]
Packit c4476c
	bic	$C[3],$A[0][1],$A[0][0]
Packit c4476c
	eor	$A[0][0],$A[0][0],$C[0]
Packit c4476c
	bic	$C[0],$A[0][4],$A[0][3]
Packit c4476c
	eor	$A[0][1],$A[0][1],$C[1]
Packit c4476c
	 ldr	$C[1],[sp,#16]
Packit c4476c
	eor	$A[0][3],$A[0][3],$C[2]
Packit c4476c
	eor	$A[0][4],$A[0][4],$C[3]
Packit c4476c
	eor	$A[0][2],$A[0][2],$C[0]
Packit c4476c
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]
Packit c4476c
Packit c4476c
	bic	$C[0],$A[1][2],$A[1][1]
Packit c4476c
	 tst	$C[1],#255			// are we done?
Packit c4476c
	 str	$C[1],[sp,#16]
Packit c4476c
	bic	$C[1],$A[1][3],$A[1][2]
Packit c4476c
	bic	$C[2],$A[1][0],$A[1][4]
Packit c4476c
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
Packit c4476c
	bic	$C[3],$A[1][1],$A[1][0]
Packit c4476c
	eor	$A[1][0],$A[1][0],$C[0]
Packit c4476c
	bic	$C[0],$A[1][4],$A[1][3]
Packit c4476c
	eor	$A[1][1],$A[1][1],$C[1]
Packit c4476c
	eor	$A[1][3],$A[1][3],$C[2]
Packit c4476c
	eor	$A[1][4],$A[1][4],$C[3]
Packit c4476c
	eor	$A[1][2],$A[1][2],$C[0]
Packit c4476c
Packit c4476c
	bic	$C[0],$A[2][2],$A[2][1]
Packit c4476c
	bic	$C[1],$A[2][3],$A[2][2]
Packit c4476c
	bic	$C[2],$A[2][0],$A[2][4]
Packit c4476c
	bic	$C[3],$A[2][1],$A[2][0]
Packit c4476c
	eor	$A[2][0],$A[2][0],$C[0]
Packit c4476c
	bic	$C[0],$A[2][4],$A[2][3]
Packit c4476c
	eor	$A[2][1],$A[2][1],$C[1]
Packit c4476c
	eor	$A[2][3],$A[2][3],$C[2]
Packit c4476c
	eor	$A[2][4],$A[2][4],$C[3]
Packit c4476c
	eor	$A[2][2],$A[2][2],$C[0]
Packit c4476c
Packit c4476c
	bic	$C[0],$A[3][2],$A[3][1]
Packit c4476c
	bic	$C[1],$A[3][3],$A[3][2]
Packit c4476c
	bic	$C[2],$A[3][0],$A[3][4]
Packit c4476c
	bic	$C[3],$A[3][1],$A[3][0]
Packit c4476c
	eor	$A[3][0],$A[3][0],$C[0]
Packit c4476c
	bic	$C[0],$A[3][4],$A[3][3]
Packit c4476c
	eor	$A[3][1],$A[3][1],$C[1]
Packit c4476c
	eor	$A[3][3],$A[3][3],$C[2]
Packit c4476c
	eor	$A[3][4],$A[3][4],$C[3]
Packit c4476c
	eor	$A[3][2],$A[3][2],$C[0]
Packit c4476c
Packit c4476c
	bic	$C[0],$A[4][2],$A[4][1]
Packit c4476c
	bic	$C[1],$A[4][3],$A[4][2]
Packit c4476c
	bic	$C[2],$A[4][0],$A[4][4]
Packit c4476c
	bic	$C[3],$A[4][1],$A[4][0]
Packit c4476c
	eor	$A[4][0],$A[4][0],$C[0]
Packit c4476c
	bic	$C[0],$A[4][4],$A[4][3]
Packit c4476c
	eor	$A[4][1],$A[4][1],$C[1]
Packit c4476c
	eor	$A[4][3],$A[4][3],$C[2]
Packit c4476c
	eor	$A[4][4],$A[4][4],$C[3]
Packit c4476c
	eor	$A[4][2],$A[4][2],$C[0]
Packit c4476c
Packit c4476c
	bne	.Loop
Packit c4476c
Packit c4476c
	ldr	x30,[sp,#24]
Packit c4476c
	.inst	0xd50323bf			// autiasp
Packit c4476c
	ret
Packit c4476c
.size	KeccakF1600_int,.-KeccakF1600_int
Packit c4476c
Packit c4476c
.type	KeccakF1600,%function
Packit c4476c
.align	5
Packit c4476c
KeccakF1600:
Packit c4476c
	.inst	0xd503233f			// paciasp
Packit c4476c
	stp	x29,x30,[sp,#-128]!
Packit c4476c
	add	x29,sp,#0
Packit c4476c
	stp	x19,x20,[sp,#16]
Packit c4476c
	stp	x21,x22,[sp,#32]
Packit c4476c
	stp	x23,x24,[sp,#48]
Packit c4476c
	stp	x25,x26,[sp,#64]
Packit c4476c
	stp	x27,x28,[sp,#80]
Packit c4476c
	sub	sp,sp,#48
Packit c4476c
Packit c4476c
	str	x0,[sp,#32]			// offload argument
Packit c4476c
	mov	$C[0],x0
Packit c4476c
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
Packit c4476c
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
Packit c4476c
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
Packit c4476c
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
Packit c4476c
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
Packit c4476c
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
Packit c4476c
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
Packit c4476c
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
Packit c4476c
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
Packit c4476c
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
Packit c4476c
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
Packit c4476c
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
Packit c4476c
	ldr	$A[4][4],[$C[0],#16*12]
Packit c4476c
Packit c4476c
	bl	KeccakF1600_int
Packit c4476c
Packit c4476c
	ldr	$C[0],[sp,#32]
Packit c4476c
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
Packit c4476c
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
Packit c4476c
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
Packit c4476c
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
Packit c4476c
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
Packit c4476c
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
Packit c4476c
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
Packit c4476c
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
Packit c4476c
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
Packit c4476c
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
Packit c4476c
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
Packit c4476c
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
Packit c4476c
	str	$A[4][4],[$C[0],#16*12]
Packit c4476c
Packit c4476c
	ldp	x19,x20,[x29,#16]
Packit c4476c
	add	sp,sp,#48
Packit c4476c
	ldp	x21,x22,[x29,#32]
Packit c4476c
	ldp	x23,x24,[x29,#48]
Packit c4476c
	ldp	x25,x26,[x29,#64]
Packit c4476c
	ldp	x27,x28,[x29,#80]
Packit c4476c
	ldp	x29,x30,[sp],#128
Packit c4476c
	.inst	0xd50323bf			// autiasp
Packit c4476c
	ret
Packit c4476c
.size	KeccakF1600,.-KeccakF1600
Packit c4476c
Packit c4476c
.globl	SHA3_absorb
Packit c4476c
.type	SHA3_absorb,%function
Packit c4476c
.align	5
Packit c4476c
SHA3_absorb:
Packit c4476c
	.inst	0xd503233f			// paciasp
Packit c4476c
	stp	x29,x30,[sp,#-128]!
Packit c4476c
	add	x29,sp,#0
Packit c4476c
	stp	x19,x20,[sp,#16]
Packit c4476c
	stp	x21,x22,[sp,#32]
Packit c4476c
	stp	x23,x24,[sp,#48]
Packit c4476c
	stp	x25,x26,[sp,#64]
Packit c4476c
	stp	x27,x28,[sp,#80]
Packit c4476c
	sub	sp,sp,#64
Packit c4476c
Packit c4476c
	stp	x0,x1,[sp,#32]			// offload arguments
Packit c4476c
	stp	x2,x3,[sp,#48]
Packit c4476c
Packit c4476c
	mov	$C[0],x0			// uint64_t A[5][5]
Packit c4476c
	mov	$C[1],x1			// const void *inp
Packit c4476c
	mov	$C[2],x2			// size_t len
Packit c4476c
	mov	$C[3],x3			// size_t bsz
Packit c4476c
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
Packit c4476c
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
Packit c4476c
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
Packit c4476c
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
Packit c4476c
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
Packit c4476c
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
Packit c4476c
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
Packit c4476c
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
Packit c4476c
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
Packit c4476c
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
Packit c4476c
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
Packit c4476c
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
Packit c4476c
	ldr	$A[4][4],[$C[0],#16*12]
Packit c4476c
	b	.Loop_absorb
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Loop_absorb:
Packit c4476c
	subs	$C[0],$C[2],$C[3]		// len - bsz
Packit c4476c
	blo	.Labsorbed
Packit c4476c
Packit c4476c
	str	$C[0],[sp,#48]			// save len - bsz
Packit c4476c
___
Packit c4476c
for (my $i=0; $i<24; $i+=2) {
Packit c4476c
my $j = $i+1;
Packit c4476c
$code.=<<___;
Packit c4476c
	ldr	$C[0],[$C[1]],#8		// *inp++
Packit c4476c
#ifdef	__AARCH64EB__
Packit c4476c
	rev	$C[0],$C[0]
Packit c4476c
#endif
Packit c4476c
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
Packit c4476c
	cmp	$C[3],#8*($i+2)
Packit c4476c
	blo	.Lprocess_block
Packit c4476c
	ldr	$C[0],[$C[1]],#8		// *inp++
Packit c4476c
#ifdef	__AARCH64EB__
Packit c4476c
	rev	$C[0],$C[0]
Packit c4476c
#endif
Packit c4476c
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
Packit c4476c
	beq	.Lprocess_block
Packit c4476c
___
Packit c4476c
}
Packit c4476c
$code.=<<___;
Packit c4476c
	ldr	$C[0],[$C[1]],#8		// *inp++
Packit c4476c
#ifdef	__AARCH64EB__
Packit c4476c
	rev	$C[0],$C[0]
Packit c4476c
#endif
Packit c4476c
	eor	$A[4][4],$A[4][4],$C[0]
Packit c4476c
Packit c4476c
.Lprocess_block:
Packit c4476c
	str	$C[1],[sp,#40]			// save inp
Packit c4476c
Packit c4476c
	bl	KeccakF1600_int
Packit c4476c
Packit c4476c
	ldr	$C[1],[sp,#40]			// restore arguments
Packit c4476c
	ldp	$C[2],$C[3],[sp,#48]
Packit c4476c
	b	.Loop_absorb
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Labsorbed:
Packit c4476c
	ldr	$C[1],[sp,#32]
Packit c4476c
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
Packit c4476c
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
Packit c4476c
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
Packit c4476c
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
Packit c4476c
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
Packit c4476c
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
Packit c4476c
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
Packit c4476c
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
Packit c4476c
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
Packit c4476c
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
Packit c4476c
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
Packit c4476c
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
Packit c4476c
	str	$A[4][4],[$C[1],#16*12]
Packit c4476c
Packit c4476c
	mov	x0,$C[2]			// return value
Packit c4476c
	ldp	x19,x20,[x29,#16]
Packit c4476c
	add	sp,sp,#64
Packit c4476c
	ldp	x21,x22,[x29,#32]
Packit c4476c
	ldp	x23,x24,[x29,#48]
Packit c4476c
	ldp	x25,x26,[x29,#64]
Packit c4476c
	ldp	x27,x28,[x29,#80]
Packit c4476c
	ldp	x29,x30,[sp],#128
Packit c4476c
	.inst	0xd50323bf			// autiasp
Packit c4476c
	ret
Packit c4476c
.size	SHA3_absorb,.-SHA3_absorb
Packit c4476c
___
Packit c4476c
{
Packit c4476c
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
Packit c4476c
$code.=<<___;
Packit c4476c
.globl	SHA3_squeeze
Packit c4476c
.type	SHA3_squeeze,%function
Packit c4476c
.align	5
Packit c4476c
SHA3_squeeze:
Packit c4476c
	.inst	0xd503233f			// paciasp
Packit c4476c
	stp	x29,x30,[sp,#-48]!
Packit c4476c
	add	x29,sp,#0
Packit c4476c
	stp	x19,x20,[sp,#16]
Packit c4476c
	stp	x21,x22,[sp,#32]
Packit c4476c
Packit c4476c
	mov	$A_flat,x0			// put aside arguments
Packit c4476c
	mov	$out,x1
Packit c4476c
	mov	$len,x2
Packit c4476c
	mov	$bsz,x3
Packit c4476c
Packit c4476c
.Loop_squeeze:
Packit c4476c
	ldr	x4,[x0],#8
Packit c4476c
	cmp	$len,#8
Packit c4476c
	blo	.Lsqueeze_tail
Packit c4476c
#ifdef	__AARCH64EB__
Packit c4476c
	rev	x4,x4
Packit c4476c
#endif
Packit c4476c
	str	x4,[$out],#8
Packit c4476c
	subs	$len,$len,#8
Packit c4476c
	beq	.Lsqueeze_done
Packit c4476c
Packit c4476c
	subs	x3,x3,#8
Packit c4476c
	bhi	.Loop_squeeze
Packit c4476c
Packit c4476c
	mov	x0,$A_flat
Packit c4476c
	bl	KeccakF1600
Packit c4476c
	mov	x0,$A_flat
Packit c4476c
	mov	x3,$bsz
Packit c4476c
	b	.Loop_squeeze
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Lsqueeze_tail:
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
Packit c4476c
.Lsqueeze_done:
Packit c4476c
	ldp	x19,x20,[sp,#16]
Packit c4476c
	ldp	x21,x22,[sp,#32]
Packit c4476c
	ldp	x29,x30,[sp],#48
Packit c4476c
	.inst	0xd50323bf			// autiasp
Packit c4476c
	ret
Packit c4476c
.size	SHA3_squeeze,.-SHA3_squeeze
Packit c4476c
___
Packit c4476c
}								}}}
Packit c4476c
								{{{
Packit c4476c
# Register names for the ARMv8.2 SHA3 code path.  $A[i][j] is the vector
# register (written with the .16b arrangement) that holds state lane
# A[i][j]: row i starts at v(5*i) and covers five consecutive registers.
my @A = map {
    my $first = $_;
    [ map { "v" . ($first + $_) . ".16b" } 0 .. 4 ]
} (0, 5, 10, 15, 20);

# Temporaries v25..v31.
my @C = map { "v" . $_ . ".16b" } (25 .. 31);
Packit c4476c
Packit c4476c
$code.=<<___;
Packit c4476c
.type	KeccakF1600_ce,%function
Packit c4476c
.align	5
Packit c4476c
KeccakF1600_ce:
Packit c4476c
	mov	x9,#12
Packit c4476c
	adr	x10,iotas
Packit c4476c
	b	.Loop_ce
Packit c4476c
.align	4
Packit c4476c
.Loop_ce:
Packit c4476c
___
Packit c4476c
for($i=0; $i<2; $i++) {
Packit c4476c
$code.=<<___;
Packit c4476c
	////////////////////////////////////////////////// Theta
Packit c4476c
	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
Packit c4476c
	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
Packit c4476c
	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
Packit c4476c
	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
Packit c4476c
	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
Packit c4476c
	eor3	$C[0],$C[0],   $A[3][0],$A[4][0]
Packit c4476c
	eor3	$C[1],$C[1],   $A[3][1],$A[4][1]
Packit c4476c
	eor3	$C[2],$C[2],   $A[3][2],$A[4][2]
Packit c4476c
	eor3	$C[3],$C[3],   $A[3][3],$A[4][3]
Packit c4476c
	eor3	$C[4],$C[4],   $A[3][4],$A[4][4]
Packit c4476c
Packit c4476c
	rax1	$C[5],$C[0],$C[2]			// D[1]
Packit c4476c
	rax1	$C[6],$C[1],$C[3]			// D[2]
Packit c4476c
	rax1	$C[2],$C[2],$C[4]			// D[3]
Packit c4476c
	rax1	$C[3],$C[3],$C[0]			// D[4]
Packit c4476c
	rax1	$C[4],$C[4],$C[1]			// D[0]
Packit c4476c
Packit c4476c
	////////////////////////////////////////////////// Theta+Rho+Pi
Packit c4476c
	xar	$C[0],   $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
Packit c4476c
	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
Packit c4476c
	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
Packit c4476c
	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
Packit c4476c
	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
Packit c4476c
Packit c4476c
	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
Packit c4476c
Packit c4476c
	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
Packit c4476c
	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
Packit c4476c
	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
Packit c4476c
	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
Packit c4476c
	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
Packit c4476c
Packit c4476c
	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
Packit c4476c
Packit c4476c
	eor	$A[0][0],$A[0][0],$C[4]
Packit c4476c
	ldr	x11,[x10],#8
Packit c4476c
Packit c4476c
	xar	$C[1],   $A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
Packit c4476c
	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
Packit c4476c
	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
Packit c4476c
	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
Packit c4476c
	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
Packit c4476c
Packit c4476c
	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *
Packit c4476c
Packit c4476c
	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
Packit c4476c
	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
Packit c4476c
	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
Packit c4476c
	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
Packit c4476c
	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
Packit c4476c
Packit c4476c
	xar	$C[2],   $A[0][3],$C[2],#64-$rhotates[0][3]	// C[2]=A[1][0]
Packit c4476c
Packit c4476c
	////////////////////////////////////////////////// Chi+Iota
Packit c4476c
	dup	$C[6],x11				// borrow C[6]
Packit c4476c
	bcax	$C[3],   $A[0][0],$A[0][2],$C[0]	// *
Packit c4476c
	bcax	$A[0][1],$C[0],   $C[1],   $A[0][2]	// *
Packit c4476c
	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
Packit c4476c
	bcax	$A[0][3],$C[1],   $A[0][0],$A[0][4]
Packit c4476c
	bcax	$A[0][4],$A[0][4],$C[0],   $A[0][0]
Packit c4476c
Packit c4476c
	bcax	$A[1][0],$C[2],   $A[1][2],$A[1][1]	// *
Packit c4476c
	bcax	$C[0],   $A[1][1],$A[1][3],$A[1][2]	// *
Packit c4476c
	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
Packit c4476c
	bcax	$A[1][3],$A[1][3],$C[2],   $A[1][4]
Packit c4476c
	bcax	$A[1][4],$A[1][4],$A[1][1],$C[2]
Packit c4476c
Packit c4476c
	eor	$A[0][0],$C[3],$C[6]			// Iota
Packit c4476c
Packit c4476c
	bcax	$C[1],   $A[2][0],$A[2][2],$A[2][1]	// *
Packit c4476c
	bcax	$C[2],   $A[2][1],$A[2][3],$A[2][2]	// *
Packit c4476c
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
Packit c4476c
	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
Packit c4476c
	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]
Packit c4476c
Packit c4476c
	bcax	$C[3],   $A[3][0],$A[3][2],$A[3][1]	// *
Packit c4476c
	bcax	$C[4],   $A[3][1],$A[3][3],$A[3][2]	// *
Packit c4476c
	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
Packit c4476c
	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
Packit c4476c
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
Packit c4476c
Packit c4476c
	bcax	$C[5],   $A[4][0],$A[4][2],$A[4][1]	// *
Packit c4476c
	bcax	$C[6],   $A[4][1],$A[4][3],$A[4][2]	// *
Packit c4476c
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
Packit c4476c
	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
Packit c4476c
	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
Packit c4476c
___
Packit c4476c
	# Chi left seven results in C registers instead of their home A
	# registers.  No moves are emitted; the names are exchanged instead,
	# so the next unrolled round is generated with the new assignment.
	# Every step is a two-way exchange and the enclosing loop runs
	# exactly twice, so the names are back on their initial registers
	# once the loop has finished.
	for my $swap ([1,1,0], [2,0,1], [2,1,2], [3,0,3], [3,1,4], [4,0,5], [4,1,6]) {
	    my ($row, $col, $tmp) = @$swap;
	    ($A[$row][$col], $C[$tmp]) = ($C[$tmp], $A[$row][$col]);
	}
Packit c4476c
}
Packit c4476c
$code.=<<___;
Packit c4476c
	subs	x9,x9,#1
Packit c4476c
	bne	.Loop_ce
Packit c4476c
Packit c4476c
	ret
Packit c4476c
.size	KeccakF1600_ce,.-KeccakF1600_ce
Packit c4476c
Packit c4476c
.type	KeccakF1600_cext,%function
Packit c4476c
.align	5
Packit c4476c
KeccakF1600_cext:
Packit c4476c
	.inst	0xd503233f		// paciasp
Packit c4476c
	stp	x29,x30,[sp,#-80]!
Packit c4476c
	add	x29,sp,#0
Packit c4476c
	stp	d8,d9,[sp,#16]		// per ABI requirement
Packit c4476c
	stp	d10,d11,[sp,#32]
Packit c4476c
	stp	d12,d13,[sp,#48]
Packit c4476c
	stp	d14,d15,[sp,#64]
Packit c4476c
___
Packit c4476c
# Emit twelve "ldp" instructions loading lanes 0..23 of A[5][5] from [x0]
# into d0..d23, two lanes per instruction.  $i is deliberately the
# file-wide variable: it is left at 24, and the template that follows uses
# it to address the last lane.
for ($i = 0; $i < 24; $i += 2) {
    $code .= sprintf("\tldp\td%d,d%d,[x0,#8*%d]\n", $i, $i + 1, $i);
}
Packit c4476c
$code.=<<___;
Packit c4476c
	ldr	d24,[x0,#8*$i]
Packit c4476c
	bl	KeccakF1600_ce
Packit c4476c
	ldr	x30,[sp,#8]
Packit c4476c
___
Packit c4476c
# Emit twelve "stp" instructions storing d0..d23 back to lanes 0..23 of
# A[5][5] at [x0], two lanes per instruction.  $i is deliberately the
# file-wide variable: it is left at 24, and the template that follows uses
# it to address the last lane.
for ($i = 0; $i < 24; $i += 2) {
    $code .= sprintf("\tstp\td%d,d%d,[x0,#8*%d]\n", $i, $i + 1, $i);
}
Packit c4476c
$code.=<<___;
Packit c4476c
	str	d24,[x0,#8*$i]
Packit c4476c
Packit c4476c
	ldp	d8,d9,[sp,#16]
Packit c4476c
	ldp	d10,d11,[sp,#32]
Packit c4476c
	ldp	d12,d13,[sp,#48]
Packit c4476c
	ldp	d14,d15,[sp,#64]
Packit c4476c
	ldr	x29,[sp],#80
Packit c4476c
	.inst	0xd50323bf		// autiasp
Packit c4476c
	ret
Packit c4476c
.size	KeccakF1600_cext,.-KeccakF1600_cext
Packit c4476c
___
Packit c4476c
Packit c4476c
{
Packit c4476c
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
Packit c4476c
Packit c4476c
$code.=<<___;
Packit c4476c
.globl	SHA3_absorb_cext
Packit c4476c
.type	SHA3_absorb_cext,%function
Packit c4476c
.align	5
Packit c4476c
SHA3_absorb_cext:
Packit c4476c
	.inst	0xd503233f		// paciasp
Packit c4476c
	stp	x29,x30,[sp,#-80]!
Packit c4476c
	add	x29,sp,#0
Packit c4476c
	stp	d8,d9,[sp,#16]		// per ABI requirement
Packit c4476c
	stp	d10,d11,[sp,#32]
Packit c4476c
	stp	d12,d13,[sp,#48]
Packit c4476c
	stp	d14,d15,[sp,#64]
Packit c4476c
___
Packit c4476c
# Emit twelve "ldp" instructions loading lanes 0..23 of A[5][5] from [x0]
# into d0..d23, two lanes per instruction.  $i is deliberately the
# file-wide variable: it is left at 24, and the template that follows uses
# it to address the last lane.
for ($i = 0; $i < 24; $i += 2) {
    $code .= sprintf("\tldp\td%d,d%d,[x0,#8*%d]\n", $i, $i + 1, $i);
}
Packit c4476c
$code.=<<___;
Packit c4476c
	ldr	d24,[x0,#8*$i]
Packit c4476c
	b	.Loop_absorb_ce
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Loop_absorb_ce:
Packit c4476c
	subs	$len,$len,$bsz		// len - bsz
Packit c4476c
	blo	.Labsorbed_ce
Packit c4476c
___
Packit c4476c
for (my $i=0; $i<24; $i+=2) {
Packit c4476c
my $j = $i+1;
Packit c4476c
$code.=<<___;
Packit c4476c
	ldr	d31,[$inp],#8		// *inp++
Packit c4476c
#ifdef	__AARCH64EB__
Packit c4476c
	rev64	v31.16b,v31.16b
Packit c4476c
#endif
Packit c4476c
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
Packit c4476c
	cmp	$bsz,#8*($i+2)
Packit c4476c
	blo	.Lprocess_block_ce
Packit c4476c
	ldr	d31,[$inp],#8		// *inp++
Packit c4476c
#ifdef	__AARCH64EB__
Packit c4476c
	rev64	v31.16b,v31.16b
Packit c4476c
#endif
Packit c4476c
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
Packit c4476c
	beq	.Lprocess_block_ce
Packit c4476c
___
Packit c4476c
}
Packit c4476c
$code.=<<___;
Packit c4476c
	ldr	d31,[$inp],#8		// *inp++
Packit c4476c
#ifdef	__AARCH64EB__
Packit c4476c
	rev64	v31.16b,v31.16b
Packit c4476c
#endif
Packit c4476c
	eor	$A[4][4],$A[4][4],v31.16b
Packit c4476c
Packit c4476c
.Lprocess_block_ce:
Packit c4476c
Packit c4476c
	bl	KeccakF1600_ce
Packit c4476c
Packit c4476c
	b	.Loop_absorb_ce
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Labsorbed_ce:
Packit c4476c
___
Packit c4476c
# Emit twelve "stp" instructions storing d0..d23 back to lanes 0..23 of
# A[5][5] at [x0], two lanes per instruction.  $i is deliberately the
# file-wide variable: it is left at 24, and the template that follows uses
# it to address the last lane.
for ($i = 0; $i < 24; $i += 2) {
    $code .= sprintf("\tstp\td%d,d%d,[x0,#8*%d]\n", $i, $i + 1, $i);
}
Packit c4476c
$code.=<<___;
Packit c4476c
	str	d24,[x0,#8*$i]
Packit c4476c
	add	x0,$len,$bsz		// return value
Packit c4476c
Packit c4476c
	ldp	d8,d9,[sp,#16]
Packit c4476c
	ldp	d10,d11,[sp,#32]
Packit c4476c
	ldp	d12,d13,[sp,#48]
Packit c4476c
	ldp	d14,d15,[sp,#64]
Packit c4476c
	ldp	x29,x30,[sp],#80
Packit c4476c
	.inst	0xd50323bf		// autiasp
Packit c4476c
	ret
Packit c4476c
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
Packit c4476c
___
Packit c4476c
}
Packit c4476c
{
Packit c4476c
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
Packit c4476c
$code.=<<___;
Packit c4476c
.globl	SHA3_squeeze_cext
Packit c4476c
.type	SHA3_squeeze_cext,%function
Packit c4476c
.align	5
Packit c4476c
SHA3_squeeze_cext:
Packit c4476c
	.inst	0xd503233f		// paciasp
Packit c4476c
	stp	x29,x30,[sp,#-16]!
Packit c4476c
	add	x29,sp,#0
Packit c4476c
	mov	x9,$ctx
Packit c4476c
	mov	x10,$bsz
Packit c4476c
Packit c4476c
.Loop_squeeze_ce:
Packit c4476c
	ldr	x4,[x9],#8
Packit c4476c
	cmp	$len,#8
Packit c4476c
	blo	.Lsqueeze_tail_ce
Packit c4476c
#ifdef	__AARCH64EB__
Packit c4476c
	rev	x4,x4
Packit c4476c
#endif
Packit c4476c
	str	x4,[$out],#8
Packit c4476c
	beq	.Lsqueeze_done_ce
Packit c4476c
Packit c4476c
	sub	$len,$len,#8
Packit c4476c
	subs	x10,x10,#8
Packit c4476c
	bhi	.Loop_squeeze_ce
Packit c4476c
Packit c4476c
	bl	KeccakF1600_cext
Packit c4476c
	ldr	x30,[sp,#8]
Packit c4476c
	mov	x9,$ctx
Packit c4476c
	mov	x10,$bsz
Packit c4476c
	b	.Loop_squeeze_ce
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Lsqueeze_tail_ce:
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done_ce
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done_ce
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done_ce
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done_ce
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done_ce
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
	lsr	x4,x4,#8
Packit c4476c
	subs	$len,$len,#1
Packit c4476c
	beq	.Lsqueeze_done_ce
Packit c4476c
	strb	w4,[$out],#1
Packit c4476c
Packit c4476c
.Lsqueeze_done_ce:
Packit c4476c
	ldr	x29,[sp],#16
Packit c4476c
	.inst	0xd50323bf		// autiasp
Packit c4476c
	ret
Packit c4476c
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
Packit c4476c
___
Packit c4476c
}								}}}
Packit c4476c
$code.=<<___;
Packit c4476c
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
Packit c4476c
___
Packit c4476c
Packit c4476c
{   # Base encodings of the four SHA3 instructions used above.  unsha3()
    # ORs the operand fields into these values.
    my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic, $arg)
    #
    # Hand-assemble one SHA3 instruction so that the output does not rely
    # on the assembler knowing these mnemonics.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand text as it appears after the mnemonic, e.g.
    #              "v25.16b,v0.16b,v5.16b,v10.16b" or
    #              "v25.16b,v6.16b,v30.16b,#64-44"
    #
    # Field placement: first register in bits 0+, second at bit 5, third
    # at bit 16, fourth operand at bit 10.  The fourth operand is either a
    # register number (eor3/bcax) or a "#a-b" immediate (xar); rax1 has
    # none and contributes 0 there.
    #
    # Returns ".inst\t0x<word>\t//<mnemonic> <arg>".  If the mnemonic is
    # unknown or the operands cannot be parsed, the instruction text is
    # returned unchanged so the caller's substitution does not silently
    # delete it; the assembler then accepts or rejects it.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	# List-context match: captures go straight into lexicals, so no
	# later pattern match can clobber them.
	my ($rd,$rn,$rm,$last) =
	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/;

	return "$mnemonic\t$arg"
	    unless defined $rd && exists $opcode{$mnemonic};

	# The fourth operand is only digits and '-', i.e. "n" or "a-b";
	# evaluate it arithmetically instead of through string eval.
	my $field = 0;
	if (defined $last) {
	    my ($minuend,@subtrahends) = split /-/, $last;
	    $field  = $minuend || 0;
	    $field -= $_ foreach @subtrahends;
	}

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|(($rm||0)<<16)|($field<<10),
			$mnemonic,$arg;
    }
}
Packit c4476c
Packit c4476c
# Final pass: post-process the accumulated text one line at a time and
# print it to STDOUT.
for my $line (split /\n/, $code) {

    # Back-quoted fragments are Perl expressions evaluated at generation
    # time and replaced by their value.
    $line =~ s/\`([^\`]*)\`/eval($1)/ge;

    # A "dup" line has its .16b arrangement rewritten to .2d.  Any other
    # line - and a "dup" line on which nothing was rewritten - has a SHA3
    # mnemonic with vector operands replaced by the output of unsha3().
    unless ($line =~ m/\bdup\b/ and $line =~ s/\.16b/.2d/g) {
	$line =~ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
    }

    print $line,"\n";
}

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";