Blame crypto/sha/asm/keccak1600-ppc64.pl

Packit c4476c
#!/usr/bin/env perl
Packit c4476c
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
#
Packit c4476c
# ====================================================================
Packit c4476c
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
Packit c4476c
# project. The module is, however, dual licensed under OpenSSL and
Packit c4476c
# CRYPTOGAMS licenses depending on where you obtain it. For further
Packit c4476c
# details see http://www.openssl.org/~appro/cryptogams/.
Packit c4476c
# ====================================================================
Packit c4476c
#
Packit c4476c
# Keccak-1600 for PPC64.
Packit c4476c
#
Packit c4476c
# June 2017.
Packit c4476c
#
Packit c4476c
# This is a straightforward KECCAK_1X_ALT implementation that works on
Packit c4476c
# *any* PPC64. Then PowerISA 2.07 adds 2x64-bit vector rotate, and
Packit c4476c
# it's possible to achieve performance better than below, but that is
Packit c4476c
# naturally an option only for POWER8 and successors...
Packit c4476c
#
Packit c4476c
######################################################################
Packit c4476c
# Numbers are cycles per processed byte.
Packit c4476c
#
Packit c4476c
#		r=1088(*)
Packit c4476c
#
Packit c4476c
# PPC970/G5	14.6/+120%
Packit c4476c
# POWER7	10.3/+100%
Packit c4476c
# POWER8	11.5/+85%
Packit c4476c
# POWER9	9.4/+45%
Packit c4476c
#
Packit c4476c
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
Packit c4476c
#	over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
Packit c4476c
#	much better (but watch out for them generating code specific
Packit c4476c
#	to the processor they execute on).
Packit c4476c
Packit c4476c
# The first command-line argument selects the target flavour; it is also
# forwarded to ppc-xlate.pl when the output pipe is opened further down.
$flavour = shift;
Packit c4476c
Packit c4476c
# Only flavours whose name contains "64" are accepted; anything else aborts.
if ($flavour =~ /64/) {
Packit c4476c
	$SIZE_T	=8;		# size of one stack slot, used in all frame offsets
Packit c4476c
	$LRSAVE	=2*$SIZE_T;	# added to $FRAME to address the link-register save slot
Packit c4476c
	$UCMP	="cmpld";	# compare mnemonic used for the "len < bsz" test
Packit c4476c
	$STU	="stdu";	# store-with-update mnemonic used to allocate the frame
Packit c4476c
	$POP	="ld";		# load mnemonic used to restore registers/locals
Packit c4476c
	$PUSH	="std";		# store mnemonic used to save registers/locals
Packit c4476c
} else { die "nonsense $flavour"; }
Packit c4476c
Packit c4476c
# Locate the ppc-xlate.pl translator relative to this script: take the
# directory part of $0 (up to the last / or \), then try that directory
# itself and, failing that, ../../perlasm/ below it.  Give up if neither
# candidate is a plain file.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
Packit c4476c
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
Packit c4476c
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
Packit c4476c
die "can't locate ppc-xlate.pl";
Packit c4476c
Packit c4476c
# Route everything subsequently printed to STDOUT through ppc-xlate.pl, which
# is run with the same perl binary ($^X) and receives the flavour plus the
# next command-line argument.  The low-precedence "or" matters here: with
# "||" the die was attached to the concatenated command string, which is
# always true, so a failed open was silently ignored.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
Packit c4476c
Packit c4476c
# Stack frame layout used by KeccakF1600 and SHA3_absorb (offsets from $sp):
#   $FRAME-18*$SIZE_T .. $FRAME-1*$SIZE_T   save area for r14-r31
#   $LOCALS+0*$SIZE_T .. $LOCALS+4*$SIZE_T  saved r3-r6 (A[][] pointer, inp,
#                                           len, bsz) and the running iotas
#                                           pointer taken from r12
#   $TEMP+0 .. $TEMP+24                     spill slots for A[0..3][4] while
#                                           Theta borrows their registers
# NOTE(review): the area below $LOCALS is presumably the ABI linkage area --
# confirm against the target ABI before changing any of these sizes.
$FRAME=24*$SIZE_T+6*$SIZE_T+32;
Packit c4476c
$LOCALS=6*$SIZE_T;
Packit c4476c
$TEMP=$LOCALS+6*$SIZE_T;
Packit c4476c
Packit c4476c
# Register name used as the stack pointer in every frame access below.
my $sp ="r1";
Packit c4476c
Packit c4476c
# Register names for the 5x5 state: row i occupies five consecutive
# registers starting at r7, r12, r17, r22 and r27 respectively, so the whole
# state lives in r7-r31 ...
my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ],
Packit c4476c
            (7, 12, 17, 22, 27));
Packit c4476c
   # ... except A[1][1], which the mapping above would place in r13.
   $A[1][1] = "r6"; # r13 is reserved
Packit c4476c
Packit c4476c
# Scratch registers C[0..3]; C[4..7] are aliased onto state registers
# temporarily inside the Theta step.
my @C = map("r$_", (0,3,4,5));
Packit c4476c
Packit c4476c
# Left-rotation amounts for the Rho+Pi step, indexed like @A: every rotldi in
# that step rotates the source lane A[i][j] by $rhotates[i][j] while moving
# it to its new position.
my @rhotates = ([  0,  1, 62, 28, 27 ],
Packit c4476c
                [ 36, 44,  6, 55, 20 ],
Packit c4476c
                [  3, 10, 43, 25, 39 ],
Packit c4476c
                [ 41, 45, 15, 21,  8 ],
Packit c4476c
                [ 18,  2, 61, 56, 14 ]);
Packit c4476c
Packit c4476c
# KeccakF1600_int: runs 24 rounds (count register loaded with 24) on the
# state held in the @A registers.  This first chunk starts Theta: it begins
# the column XORs and spills A[0..3][4] to the $TEMP slots so that their
# registers can be borrowed as extra temporaries.
$code.=<<___;
Packit c4476c
.text
Packit c4476c
Packit c4476c
.type	KeccakF1600_int,\@function
Packit c4476c
.align	5
Packit c4476c
KeccakF1600_int:
Packit c4476c
	li	r0,24
Packit c4476c
	mtctr	r0
Packit c4476c
	b	.Loop
Packit c4476c
.align	4
Packit c4476c
.Loop:
Packit c4476c
	xor	$C[0],$A[0][0],$A[1][0]		; Theta
Packit c4476c
	std	$A[0][4],`$TEMP+0`($sp)
Packit c4476c
	xor	$C[1],$A[0][1],$A[1][1]
Packit c4476c
	std	$A[1][4],`$TEMP+8`($sp)
Packit c4476c
	xor	$C[2],$A[0][2],$A[1][2]
Packit c4476c
	std	$A[2][4],`$TEMP+16`($sp)
Packit c4476c
	xor	$C[3],$A[0][3],$A[1][3]
Packit c4476c
	std	$A[3][4],`$TEMP+24`($sp)
Packit c4476c
___
Packit c4476c
	# The registers of A[0..3][4] were just stored to the $TEMP slots by the
	# assembly above, so they are reused as four extra scratch registers
	# C[4..7] for the remainder of Theta.
	$C[4]=$A[0][4];
Packit c4476c
	$C[5]=$A[1][4];
Packit c4476c
	$C[6]=$A[2][4];
Packit c4476c
	$C[7]=$A[3][4];
Packit c4476c
$code.=<<___;
Packit c4476c
	xor	$C[4],$A[0][4],$A[1][4]
Packit c4476c
	xor	$C[0],$C[0],$A[2][0]
Packit c4476c
	xor	$C[1],$C[1],$A[2][1]
Packit c4476c
	xor	$C[2],$C[2],$A[2][2]
Packit c4476c
	xor	$C[3],$C[3],$A[2][3]
Packit c4476c
	xor	$C[4],$C[4],$A[2][4]
Packit c4476c
	xor	$C[0],$C[0],$A[3][0]
Packit c4476c
	xor	$C[1],$C[1],$A[3][1]
Packit c4476c
	xor	$C[2],$C[2],$A[3][2]
Packit c4476c
	xor	$C[3],$C[3],$A[3][3]
Packit c4476c
	xor	$C[4],$C[4],$A[3][4]
Packit c4476c
	xor	$C[0],$C[0],$A[4][0]
Packit c4476c
	xor	$C[2],$C[2],$A[4][2]
Packit c4476c
	xor	$C[1],$C[1],$A[4][1]
Packit c4476c
	xor	$C[3],$C[3],$A[4][3]
Packit c4476c
	rotldi	$C[5],$C[2],1
Packit c4476c
	xor	$C[4],$C[4],$A[4][4]
Packit c4476c
	rotldi	$C[6],$C[3],1
Packit c4476c
	xor	$C[5],$C[5],$C[0]
Packit c4476c
	rotldi	$C[7],$C[4],1
Packit c4476c
Packit c4476c
	xor	$A[0][1],$A[0][1],$C[5]
Packit c4476c
	xor	$A[1][1],$A[1][1],$C[5]
Packit c4476c
	xor	$A[2][1],$A[2][1],$C[5]
Packit c4476c
	xor	$A[3][1],$A[3][1],$C[5]
Packit c4476c
	xor	$A[4][1],$A[4][1],$C[5]
Packit c4476c
Packit c4476c
	rotldi	$C[5],$C[0],1
Packit c4476c
	xor	$C[6],$C[6],$C[1]
Packit c4476c
	xor	$C[2],$C[2],$C[7]
Packit c4476c
	rotldi	$C[7],$C[1],1
Packit c4476c
	xor	$C[3],$C[3],$C[5]
Packit c4476c
	xor	$C[4],$C[4],$C[7]
Packit c4476c
Packit c4476c
	xor	$C[1],   $A[0][2],$C[6]			;mr	$C[1],$A[0][2]
Packit c4476c
	xor	$A[1][2],$A[1][2],$C[6]
Packit c4476c
	xor	$A[2][2],$A[2][2],$C[6]
Packit c4476c
	xor	$A[3][2],$A[3][2],$C[6]
Packit c4476c
	xor	$A[4][2],$A[4][2],$C[6]
Packit c4476c
Packit c4476c
	xor	$A[0][0],$A[0][0],$C[4]
Packit c4476c
	xor	$A[1][0],$A[1][0],$C[4]
Packit c4476c
	xor	$A[2][0],$A[2][0],$C[4]
Packit c4476c
	xor	$A[3][0],$A[3][0],$C[4]
Packit c4476c
	xor	$A[4][0],$A[4][0],$C[4]
Packit c4476c
___
Packit c4476c
	# Drop the temporary aliases: the following chunk reloads A[0..3][4]
	# from the $TEMP slots, so C[4..7] must not be referenced any more.
	$C[4]=undef;
Packit c4476c
	$C[5]=undef;
Packit c4476c
	$C[6]=undef;
Packit c4476c
	$C[7]=undef;
Packit c4476c
# Rest of the round: reload the spilled lanes, finish Theta, then Rho+Pi and
# Chi+Iota (the iota pointer is kept in the $LOCALS+4*$SIZE_T slot and
# advanced with ldu each round).  The same heredoc then emits the
# KeccakF1600 wrapper, the dword_le_load helper and SHA3_absorb.
$code.=<<___;
Packit c4476c
	ld	$A[0][4],`$TEMP+0`($sp)
Packit c4476c
	xor	$C[0],   $A[0][3],$C[2]			;mr	$C[0],$A[0][3]
Packit c4476c
	ld	$A[1][4],`$TEMP+8`($sp)
Packit c4476c
	xor	$A[1][3],$A[1][3],$C[2]
Packit c4476c
	ld	$A[2][4],`$TEMP+16`($sp)
Packit c4476c
	xor	$A[2][3],$A[2][3],$C[2]
Packit c4476c
	ld	$A[3][4],`$TEMP+24`($sp)
Packit c4476c
	xor	$A[3][3],$A[3][3],$C[2]
Packit c4476c
	xor	$A[4][3],$A[4][3],$C[2]
Packit c4476c
Packit c4476c
	xor	$C[2],   $A[0][4],$C[3]			;mr	$C[2],$A[0][4]
Packit c4476c
	xor	$A[1][4],$A[1][4],$C[3]
Packit c4476c
	xor	$A[2][4],$A[2][4],$C[3]
Packit c4476c
	xor	$A[3][4],$A[3][4],$C[3]
Packit c4476c
	xor	$A[4][4],$A[4][4],$C[3]
Packit c4476c
Packit c4476c
	mr	$C[3],$A[0][1]				; Rho+Pi
Packit c4476c
	rotldi	$A[0][1],$A[1][1],$rhotates[1][1]
Packit c4476c
	;mr	$C[1],$A[0][2]
Packit c4476c
	rotldi	$A[0][2],$A[2][2],$rhotates[2][2]
Packit c4476c
	;mr	$C[0],$A[0][3]
Packit c4476c
	rotldi	$A[0][3],$A[3][3],$rhotates[3][3]
Packit c4476c
	;mr	$C[2],$A[0][4]
Packit c4476c
	rotldi	$A[0][4],$A[4][4],$rhotates[4][4]
Packit c4476c
Packit c4476c
	rotldi	$A[1][1],$A[1][4],$rhotates[1][4]
Packit c4476c
	rotldi	$A[2][2],$A[2][3],$rhotates[2][3]
Packit c4476c
	rotldi	$A[3][3],$A[3][2],$rhotates[3][2]
Packit c4476c
	rotldi	$A[4][4],$A[4][1],$rhotates[4][1]
Packit c4476c
Packit c4476c
	rotldi	$A[1][4],$A[4][2],$rhotates[4][2]
Packit c4476c
	rotldi	$A[2][3],$A[3][4],$rhotates[3][4]
Packit c4476c
	rotldi	$A[3][2],$A[2][1],$rhotates[2][1]
Packit c4476c
	rotldi	$A[4][1],$A[1][3],$rhotates[1][3]
Packit c4476c
Packit c4476c
	rotldi	$A[4][2],$A[2][4],$rhotates[2][4]
Packit c4476c
	rotldi	$A[3][4],$A[4][3],$rhotates[4][3]
Packit c4476c
	rotldi	$A[2][1],$A[1][2],$rhotates[1][2]
Packit c4476c
	rotldi	$A[1][3],$A[3][1],$rhotates[3][1]
Packit c4476c
Packit c4476c
	rotldi	$A[2][4],$A[4][0],$rhotates[4][0]
Packit c4476c
	rotldi	$A[4][3],$A[3][0],$rhotates[3][0]
Packit c4476c
	rotldi	$A[1][2],$A[2][0],$rhotates[2][0]
Packit c4476c
	rotldi	$A[3][1],$A[1][0],$rhotates[1][0]
Packit c4476c
Packit c4476c
	rotldi	$A[1][0],$C[0],$rhotates[0][3]
Packit c4476c
	rotldi	$A[2][0],$C[3],$rhotates[0][1]
Packit c4476c
	rotldi	$A[3][0],$C[2],$rhotates[0][4]
Packit c4476c
	rotldi	$A[4][0],$C[1],$rhotates[0][2]
Packit c4476c
Packit c4476c
	andc	$C[0],$A[0][2],$A[0][1]			; Chi+Iota
Packit c4476c
	andc	$C[1],$A[0][3],$A[0][2]
Packit c4476c
	andc	$C[2],$A[0][0],$A[0][4]
Packit c4476c
	andc	$C[3],$A[0][1],$A[0][0]
Packit c4476c
	xor	$A[0][0],$A[0][0],$C[0]
Packit c4476c
	andc	$C[0],$A[0][4],$A[0][3]
Packit c4476c
	xor	$A[0][1],$A[0][1],$C[1]
Packit c4476c
	 ld	$C[1],`$LOCALS+4*$SIZE_T`($sp)
Packit c4476c
	xor	$A[0][3],$A[0][3],$C[2]
Packit c4476c
	xor	$A[0][4],$A[0][4],$C[3]
Packit c4476c
	xor	$A[0][2],$A[0][2],$C[0]
Packit c4476c
	 ldu	$C[3],8($C[1])				; Iota[i++]
Packit c4476c
Packit c4476c
	andc	$C[0],$A[1][2],$A[1][1]
Packit c4476c
	 std	$C[1],`$LOCALS+4*$SIZE_T`($sp)
Packit c4476c
	andc	$C[1],$A[1][3],$A[1][2]
Packit c4476c
	andc	$C[2],$A[1][0],$A[1][4]
Packit c4476c
	 xor	$A[0][0],$A[0][0],$C[3]			; A[0][0] ^= Iota
Packit c4476c
	andc	$C[3],$A[1][1],$A[1][0]
Packit c4476c
	xor	$A[1][0],$A[1][0],$C[0]
Packit c4476c
	andc	$C[0],$A[1][4],$A[1][3]
Packit c4476c
	xor	$A[1][1],$A[1][1],$C[1]
Packit c4476c
	xor	$A[1][3],$A[1][3],$C[2]
Packit c4476c
	xor	$A[1][4],$A[1][4],$C[3]
Packit c4476c
	xor	$A[1][2],$A[1][2],$C[0]
Packit c4476c
Packit c4476c
	andc	$C[0],$A[2][2],$A[2][1]
Packit c4476c
	andc	$C[1],$A[2][3],$A[2][2]
Packit c4476c
	andc	$C[2],$A[2][0],$A[2][4]
Packit c4476c
	andc	$C[3],$A[2][1],$A[2][0]
Packit c4476c
	xor	$A[2][0],$A[2][0],$C[0]
Packit c4476c
	andc	$C[0],$A[2][4],$A[2][3]
Packit c4476c
	xor	$A[2][1],$A[2][1],$C[1]
Packit c4476c
	xor	$A[2][3],$A[2][3],$C[2]
Packit c4476c
	xor	$A[2][4],$A[2][4],$C[3]
Packit c4476c
	xor	$A[2][2],$A[2][2],$C[0]
Packit c4476c
Packit c4476c
	andc	$C[0],$A[3][2],$A[3][1]
Packit c4476c
	andc	$C[1],$A[3][3],$A[3][2]
Packit c4476c
	andc	$C[2],$A[3][0],$A[3][4]
Packit c4476c
	andc	$C[3],$A[3][1],$A[3][0]
Packit c4476c
	xor	$A[3][0],$A[3][0],$C[0]
Packit c4476c
	andc	$C[0],$A[3][4],$A[3][3]
Packit c4476c
	xor	$A[3][1],$A[3][1],$C[1]
Packit c4476c
	xor	$A[3][3],$A[3][3],$C[2]
Packit c4476c
	xor	$A[3][4],$A[3][4],$C[3]
Packit c4476c
	xor	$A[3][2],$A[3][2],$C[0]
Packit c4476c
Packit c4476c
	andc	$C[0],$A[4][2],$A[4][1]
Packit c4476c
	andc	$C[1],$A[4][3],$A[4][2]
Packit c4476c
	andc	$C[2],$A[4][0],$A[4][4]
Packit c4476c
	andc	$C[3],$A[4][1],$A[4][0]
Packit c4476c
	xor	$A[4][0],$A[4][0],$C[0]
Packit c4476c
	andc	$C[0],$A[4][4],$A[4][3]
Packit c4476c
	xor	$A[4][1],$A[4][1],$C[1]
Packit c4476c
	xor	$A[4][3],$A[4][3],$C[2]
Packit c4476c
	xor	$A[4][4],$A[4][4],$C[3]
Packit c4476c
	xor	$A[4][2],$A[4][2],$C[0]
Packit c4476c
Packit c4476c
	bdnz	.Loop
Packit c4476c
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,0,0
Packit c4476c
.size	KeccakF1600_int,.-KeccakF1600_int
Packit c4476c
Packit c4476c
.type	KeccakF1600,\@function
Packit c4476c
.align	5
Packit c4476c
KeccakF1600:
Packit c4476c
	$STU	$sp,-$FRAME($sp)
Packit c4476c
	mflr	r0
Packit c4476c
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
Packit c4476c
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
Packit c4476c
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
Packit c4476c
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
Packit c4476c
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
Packit c4476c
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
Packit c4476c
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
Packit c4476c
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
Packit c4476c
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
Packit c4476c
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
Packit c4476c
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
Packit c4476c
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
Packit c4476c
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
Packit c4476c
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
Packit c4476c
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
Packit c4476c
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
Packit c4476c
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
Packit c4476c
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
Packit c4476c
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
Packit c4476c
Packit c4476c
	bl	PICmeup
Packit c4476c
	subi	r12,r12,8			; prepare for ldu
Packit c4476c
Packit c4476c
	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)
Packit c4476c
	;$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)
Packit c4476c
	;$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)
Packit c4476c
	;$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)
Packit c4476c
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)
Packit c4476c
Packit c4476c
	ld	$A[0][0],`8*0`(r3)		; load A[5][5]
Packit c4476c
	ld	$A[0][1],`8*1`(r3)
Packit c4476c
	ld	$A[0][2],`8*2`(r3)
Packit c4476c
	ld	$A[0][3],`8*3`(r3)
Packit c4476c
	ld	$A[0][4],`8*4`(r3)
Packit c4476c
	ld	$A[1][0],`8*5`(r3)
Packit c4476c
	ld	$A[1][1],`8*6`(r3)
Packit c4476c
	ld	$A[1][2],`8*7`(r3)
Packit c4476c
	ld	$A[1][3],`8*8`(r3)
Packit c4476c
	ld	$A[1][4],`8*9`(r3)
Packit c4476c
	ld	$A[2][0],`8*10`(r3)
Packit c4476c
	ld	$A[2][1],`8*11`(r3)
Packit c4476c
	ld	$A[2][2],`8*12`(r3)
Packit c4476c
	ld	$A[2][3],`8*13`(r3)
Packit c4476c
	ld	$A[2][4],`8*14`(r3)
Packit c4476c
	ld	$A[3][0],`8*15`(r3)
Packit c4476c
	ld	$A[3][1],`8*16`(r3)
Packit c4476c
	ld	$A[3][2],`8*17`(r3)
Packit c4476c
	ld	$A[3][3],`8*18`(r3)
Packit c4476c
	ld	$A[3][4],`8*19`(r3)
Packit c4476c
	ld	$A[4][0],`8*20`(r3)
Packit c4476c
	ld	$A[4][1],`8*21`(r3)
Packit c4476c
	ld	$A[4][2],`8*22`(r3)
Packit c4476c
	ld	$A[4][3],`8*23`(r3)
Packit c4476c
	ld	$A[4][4],`8*24`(r3)
Packit c4476c
Packit c4476c
	bl	KeccakF1600_int
Packit c4476c
Packit c4476c
	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
Packit c4476c
	std	$A[0][0],`8*0`(r3)		; return A[5][5]
Packit c4476c
	std	$A[0][1],`8*1`(r3)
Packit c4476c
	std	$A[0][2],`8*2`(r3)
Packit c4476c
	std	$A[0][3],`8*3`(r3)
Packit c4476c
	std	$A[0][4],`8*4`(r3)
Packit c4476c
	std	$A[1][0],`8*5`(r3)
Packit c4476c
	std	$A[1][1],`8*6`(r3)
Packit c4476c
	std	$A[1][2],`8*7`(r3)
Packit c4476c
	std	$A[1][3],`8*8`(r3)
Packit c4476c
	std	$A[1][4],`8*9`(r3)
Packit c4476c
	std	$A[2][0],`8*10`(r3)
Packit c4476c
	std	$A[2][1],`8*11`(r3)
Packit c4476c
	std	$A[2][2],`8*12`(r3)
Packit c4476c
	std	$A[2][3],`8*13`(r3)
Packit c4476c
	std	$A[2][4],`8*14`(r3)
Packit c4476c
	std	$A[3][0],`8*15`(r3)
Packit c4476c
	std	$A[3][1],`8*16`(r3)
Packit c4476c
	std	$A[3][2],`8*17`(r3)
Packit c4476c
	std	$A[3][3],`8*18`(r3)
Packit c4476c
	std	$A[3][4],`8*19`(r3)
Packit c4476c
	std	$A[4][0],`8*20`(r3)
Packit c4476c
	std	$A[4][1],`8*21`(r3)
Packit c4476c
	std	$A[4][2],`8*22`(r3)
Packit c4476c
	std	$A[4][3],`8*23`(r3)
Packit c4476c
	std	$A[4][4],`8*24`(r3)
Packit c4476c
Packit c4476c
	$POP	r0,`$FRAME+$LRSAVE`($sp)
Packit c4476c
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
Packit c4476c
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
Packit c4476c
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
Packit c4476c
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
Packit c4476c
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
Packit c4476c
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
Packit c4476c
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
Packit c4476c
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
Packit c4476c
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
Packit c4476c
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
Packit c4476c
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
Packit c4476c
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
Packit c4476c
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
Packit c4476c
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
Packit c4476c
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
Packit c4476c
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
Packit c4476c
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
Packit c4476c
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
Packit c4476c
	mtlr	r0
Packit c4476c
	addi	$sp,$sp,$FRAME
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,4,1,0x80,18,1,0
Packit c4476c
	.long	0
Packit c4476c
.size	KeccakF1600,.-KeccakF1600
Packit c4476c
Packit c4476c
.type	dword_le_load,\@function
Packit c4476c
.align	5
Packit c4476c
dword_le_load:
Packit c4476c
	lbzu	r0,1(r3)
Packit c4476c
	lbzu	r4,1(r3)
Packit c4476c
	lbzu	r5,1(r3)
Packit c4476c
	insrdi	r0,r4,8,48
Packit c4476c
	lbzu	r4,1(r3)
Packit c4476c
	insrdi	r0,r5,8,40
Packit c4476c
	lbzu	r5,1(r3)
Packit c4476c
	insrdi	r0,r4,8,32
Packit c4476c
	lbzu	r4,1(r3)
Packit c4476c
	insrdi	r0,r5,8,24
Packit c4476c
	lbzu	r5,1(r3)
Packit c4476c
	insrdi	r0,r4,8,16
Packit c4476c
	lbzu	r4,1(r3)
Packit c4476c
	insrdi	r0,r5,8,8
Packit c4476c
	insrdi	r0,r4,8,0
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,1,0
Packit c4476c
	.long	0
Packit c4476c
.size	dword_le_load,.-dword_le_load
Packit c4476c
Packit c4476c
.globl	SHA3_absorb
Packit c4476c
.type	SHA3_absorb,\@function
Packit c4476c
.align	5
Packit c4476c
SHA3_absorb:
Packit c4476c
	$STU	$sp,-$FRAME($sp)
Packit c4476c
	mflr	r0
Packit c4476c
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
Packit c4476c
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
Packit c4476c
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
Packit c4476c
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
Packit c4476c
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
Packit c4476c
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
Packit c4476c
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
Packit c4476c
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
Packit c4476c
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
Packit c4476c
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
Packit c4476c
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
Packit c4476c
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
Packit c4476c
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
Packit c4476c
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
Packit c4476c
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
Packit c4476c
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
Packit c4476c
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
Packit c4476c
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
Packit c4476c
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
Packit c4476c
Packit c4476c
	bl	PICmeup
Packit c4476c
	subi	r4,r4,1				; prepare for lbzu
Packit c4476c
	subi	r12,r12,8			; prepare for ldu
Packit c4476c
Packit c4476c
	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)	; save A[][]
Packit c4476c
	$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)	; save inp
Packit c4476c
	$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)	; save len
Packit c4476c
	$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)	; save bsz
Packit c4476c
	mr	r0,r6
Packit c4476c
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)
Packit c4476c
Packit c4476c
	ld	$A[0][0],`8*0`(r3)		; load A[5][5]
Packit c4476c
	ld	$A[0][1],`8*1`(r3)
Packit c4476c
	ld	$A[0][2],`8*2`(r3)
Packit c4476c
	ld	$A[0][3],`8*3`(r3)
Packit c4476c
	ld	$A[0][4],`8*4`(r3)
Packit c4476c
	ld	$A[1][0],`8*5`(r3)
Packit c4476c
	ld	$A[1][1],`8*6`(r3)
Packit c4476c
	ld	$A[1][2],`8*7`(r3)
Packit c4476c
	ld	$A[1][3],`8*8`(r3)
Packit c4476c
	ld	$A[1][4],`8*9`(r3)
Packit c4476c
	ld	$A[2][0],`8*10`(r3)
Packit c4476c
	ld	$A[2][1],`8*11`(r3)
Packit c4476c
	ld	$A[2][2],`8*12`(r3)
Packit c4476c
	ld	$A[2][3],`8*13`(r3)
Packit c4476c
	ld	$A[2][4],`8*14`(r3)
Packit c4476c
	ld	$A[3][0],`8*15`(r3)
Packit c4476c
	ld	$A[3][1],`8*16`(r3)
Packit c4476c
	ld	$A[3][2],`8*17`(r3)
Packit c4476c
	ld	$A[3][3],`8*18`(r3)
Packit c4476c
	ld	$A[3][4],`8*19`(r3)
Packit c4476c
	ld	$A[4][0],`8*20`(r3)
Packit c4476c
	ld	$A[4][1],`8*21`(r3)
Packit c4476c
	ld	$A[4][2],`8*22`(r3)
Packit c4476c
	ld	$A[4][3],`8*23`(r3)
Packit c4476c
	ld	$A[4][4],`8*24`(r3)
Packit c4476c
Packit c4476c
	mr	r3,r4
Packit c4476c
	mr	r4,r5
Packit c4476c
	mr	r5,r0
Packit c4476c
Packit c4476c
	b	.Loop_absorb
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Loop_absorb:
Packit c4476c
	$UCMP	r4,r5				; len < bsz?
Packit c4476c
	blt	.Labsorbed
Packit c4476c
Packit c4476c
	sub	r4,r4,r5			; len -= bsz
Packit c4476c
	srwi	r5,r5,3
Packit c4476c
	$PUSH	r4,`$LOCALS+2*$SIZE_T`($sp)	; save len
Packit c4476c
	mtctr	r5
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[0][0],$A[0][0],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[0][1],$A[0][1],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[0][2],$A[0][2],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[0][3],$A[0][3],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[0][4],$A[0][4],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[1][0],$A[1][0],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[1][1],$A[1][1],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[1][2],$A[1][2],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[1][3],$A[1][3],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[1][4],$A[1][4],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[2][0],$A[2][0],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[2][1],$A[2][1],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[2][2],$A[2][2],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[2][3],$A[2][3],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[2][4],$A[2][4],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[3][0],$A[3][0],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[3][1],$A[3][1],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[3][2],$A[3][2],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[3][3],$A[3][3],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[3][4],$A[3][4],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[4][0],$A[4][0],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[4][1],$A[4][1],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[4][2],$A[4][2],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[4][3],$A[4][3],r0
Packit c4476c
	bdz	.Lprocess_block
Packit c4476c
	bl	dword_le_load			; *inp++
Packit c4476c
	xor	$A[4][4],$A[4][4],r0
Packit c4476c
Packit c4476c
.Lprocess_block:
Packit c4476c
	$PUSH	r3,`$LOCALS+1*$SIZE_T`($sp)	; save inp
Packit c4476c
Packit c4476c
	bl	KeccakF1600_int
Packit c4476c
Packit c4476c
	$POP	r0,`$LOCALS+4*$SIZE_T`($sp)	; pull iotas[24]
Packit c4476c
	$POP	r5,`$LOCALS+3*$SIZE_T`($sp)	; restore bsz
Packit c4476c
	$POP	r4,`$LOCALS+2*$SIZE_T`($sp)	; restore len
Packit c4476c
	$POP	r3,`$LOCALS+1*$SIZE_T`($sp)	; restore inp
Packit c4476c
	addic	r0,r0,`-8*24`			; rewind iotas
Packit c4476c
	$PUSH	r0,`$LOCALS+4*$SIZE_T`($sp)
Packit c4476c
Packit c4476c
	b	.Loop_absorb
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Labsorbed:
Packit c4476c
	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
Packit c4476c
	std	$A[0][0],`8*0`(r3)		; return A[5][5]
Packit c4476c
	std	$A[0][1],`8*1`(r3)
Packit c4476c
	std	$A[0][2],`8*2`(r3)
Packit c4476c
	std	$A[0][3],`8*3`(r3)
Packit c4476c
	std	$A[0][4],`8*4`(r3)
Packit c4476c
	std	$A[1][0],`8*5`(r3)
Packit c4476c
	std	$A[1][1],`8*6`(r3)
Packit c4476c
	std	$A[1][2],`8*7`(r3)
Packit c4476c
	std	$A[1][3],`8*8`(r3)
Packit c4476c
	std	$A[1][4],`8*9`(r3)
Packit c4476c
	std	$A[2][0],`8*10`(r3)
Packit c4476c
	std	$A[2][1],`8*11`(r3)
Packit c4476c
	std	$A[2][2],`8*12`(r3)
Packit c4476c
	std	$A[2][3],`8*13`(r3)
Packit c4476c
	std	$A[2][4],`8*14`(r3)
Packit c4476c
	std	$A[3][0],`8*15`(r3)
Packit c4476c
	std	$A[3][1],`8*16`(r3)
Packit c4476c
	std	$A[3][2],`8*17`(r3)
Packit c4476c
	std	$A[3][3],`8*18`(r3)
Packit c4476c
	std	$A[3][4],`8*19`(r3)
Packit c4476c
	std	$A[4][0],`8*20`(r3)
Packit c4476c
	std	$A[4][1],`8*21`(r3)
Packit c4476c
	std	$A[4][2],`8*22`(r3)
Packit c4476c
	std	$A[4][3],`8*23`(r3)
Packit c4476c
	std	$A[4][4],`8*24`(r3)
Packit c4476c
Packit c4476c
	mr	r3,r4				; return value
Packit c4476c
	$POP	r0,`$FRAME+$LRSAVE`($sp)
Packit c4476c
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
Packit c4476c
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
Packit c4476c
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
Packit c4476c
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
Packit c4476c
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
Packit c4476c
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
Packit c4476c
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
Packit c4476c
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
Packit c4476c
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
Packit c4476c
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
Packit c4476c
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
Packit c4476c
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
Packit c4476c
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
Packit c4476c
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
Packit c4476c
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
Packit c4476c
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
Packit c4476c
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
Packit c4476c
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
Packit c4476c
	mtlr	r0
Packit c4476c
	addi	$sp,$sp,$FRAME
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,4,1,0x80,18,4,0
Packit c4476c
	.long	0
Packit c4476c
.size	SHA3_absorb,.-SHA3_absorb
Packit c4476c
___
Packit c4476c
{
Packit c4476c
# SHA3_squeeze keeps its working values in r28-r31, which its prologue saves
# and its epilogue restores: the state pointer, the output pointer (biased
# by -1 for stbu), the remaining length and the block size.  The incoming
# arguments are taken from r3, r4, r5 and r6 respectively.
my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
Packit c4476c
$code.=<<___;
Packit c4476c
.globl	SHA3_squeeze
Packit c4476c
.type	SHA3_squeeze,\@function
Packit c4476c
.align	5
Packit c4476c
SHA3_squeeze:
Packit c4476c
	$STU	$sp,`-10*$SIZE_T`($sp)
Packit c4476c
	mflr	r0
Packit c4476c
	$PUSH	r28,`6*$SIZE_T`($sp)
Packit c4476c
	$PUSH	r29,`7*$SIZE_T`($sp)
Packit c4476c
	$PUSH	r30,`8*$SIZE_T`($sp)
Packit c4476c
	$PUSH	r31,`9*$SIZE_T`($sp)
Packit c4476c
	$PUSH	r0,`10*$SIZE_T+$LRSAVE`($sp)
Packit c4476c
Packit c4476c
	mr	$A_flat,r3
Packit c4476c
	subi	r3,r3,8			; prepare for ldu
Packit c4476c
	subi	$out,r4,1		; prepare for stbu
Packit c4476c
	mr	$len,r5
Packit c4476c
	mr	$bsz,r6
Packit c4476c
	b	.Loop_squeeze
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Loop_squeeze:
Packit c4476c
	ldu	r0,8(r3)
Packit c4476c
	${UCMP}i $len,8
Packit c4476c
	blt	.Lsqueeze_tail
Packit c4476c
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
	srdi	r0,r0,8
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
	srdi	r0,r0,8
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
	srdi	r0,r0,8
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
	srdi	r0,r0,8
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
	srdi	r0,r0,8
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
	srdi	r0,r0,8
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
	srdi	r0,r0,8
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
Packit c4476c
	subic.	$len,$len,8
Packit c4476c
	beq	.Lsqueeze_done
Packit c4476c
Packit c4476c
	subic.	r6,r6,8
Packit c4476c
	bgt	.Loop_squeeze
Packit c4476c
Packit c4476c
	mr	r3,$A_flat
Packit c4476c
	bl	KeccakF1600
Packit c4476c
	subi	r3,$A_flat,8		; prepare for ldu
Packit c4476c
	mr	r6,$bsz
Packit c4476c
	b	.Loop_squeeze
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Lsqueeze_tail:
Packit c4476c
	mtctr	$len
Packit c4476c
.Loop_tail:
Packit c4476c
	stbu	r0,1($out)
Packit c4476c
	srdi	r0,r0,8
Packit c4476c
	bdnz	.Loop_tail
Packit c4476c
Packit c4476c
.Lsqueeze_done:
Packit c4476c
	$POP	r0,`10*$SIZE_T+$LRSAVE`($sp)
Packit c4476c
	$POP	r28,`6*$SIZE_T`($sp)
Packit c4476c
	$POP	r29,`7*$SIZE_T`($sp)
Packit c4476c
	$POP	r30,`8*$SIZE_T`($sp)
Packit c4476c
	$POP	r31,`9*$SIZE_T`($sp)
Packit c4476c
	mtlr	r0
Packit c4476c
	addi	$sp,$sp,`10*$SIZE_T`
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,4,1,0x80,4,4,0
Packit c4476c
	.long	0
Packit c4476c
.size	SHA3_squeeze,.-SHA3_squeeze
Packit c4476c
___
Packit c4476c
}
Packit c4476c
Packit c4476c
# Ugly hack here, because PPC assembler syntax seems to vary too
Packit c4476c
# much from platform to platform...
Packit c4476c
# PICmeup returns the address of the iotas table in r12: it obtains its own
# address with bcl/mflr and adds a fixed distance, so the .space padding in
# this chunk must keep iotas at exactly that distance from the code.  iotas
# holds the 24 64-bit constants consumed one per round by KeccakF1600_int.
$code.=<<___;
Packit c4476c
.align	6
Packit c4476c
PICmeup:
Packit c4476c
	mflr	r0
Packit c4476c
	bcl	20,31,\$+4
Packit c4476c
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
Packit c4476c
	addi	r12,r12,`64-8`
Packit c4476c
	mtlr	r0
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,0,0
Packit c4476c
	.space	`64-9*4`
Packit c4476c
.type	iotas,\@object
Packit c4476c
iotas:
Packit c4476c
	.quad	0x0000000000000001
Packit c4476c
	.quad	0x0000000000008082
Packit c4476c
	.quad	0x800000000000808a
Packit c4476c
	.quad	0x8000000080008000
Packit c4476c
	.quad	0x000000000000808b
Packit c4476c
	.quad	0x0000000080000001
Packit c4476c
	.quad	0x8000000080008081
Packit c4476c
	.quad	0x8000000000008009
Packit c4476c
	.quad	0x000000000000008a
Packit c4476c
	.quad	0x0000000000000088
Packit c4476c
	.quad	0x0000000080008009
Packit c4476c
	.quad	0x000000008000000a
Packit c4476c
	.quad	0x000000008000808b
Packit c4476c
	.quad	0x800000000000008b
Packit c4476c
	.quad	0x8000000000008089
Packit c4476c
	.quad	0x8000000000008003
Packit c4476c
	.quad	0x8000000000008002
Packit c4476c
	.quad	0x8000000000000080
Packit c4476c
	.quad	0x000000000000800a
Packit c4476c
	.quad	0x800000008000000a
Packit c4476c
	.quad	0x8000000080008081
Packit c4476c
	.quad	0x8000000000008080
Packit c4476c
	.quad	0x0000000080000001
Packit c4476c
	.quad	0x8000000080008008
Packit c4476c
.size	iotas,.-iotas
Packit c4476c
.asciz	"Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
Packit c4476c
___
Packit c4476c
Packit c4476c
# Replace every `...` span in the generated text with the result of
# evaluating it as Perl (this is how the frame offsets are computed).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
Packit c4476c
# STDOUT is the pipe to ppc-xlate.pl opened above.
print $code;
Packit c4476c
# Closing the pipe is checked so that a failure is reported rather than lost.
close STDOUT or die "error closing STDOUT: $!";