Blame crypto/chacha/asm/chacha-s390x.pl

Packit c4476c
#! /usr/bin/env perl
Packit c4476c
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
Packit c4476c
#
Packit c4476c
# ====================================================================
Packit c4476c
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
Packit c4476c
# project. The module is, however, dual licensed under OpenSSL and
Packit c4476c
# CRYPTOGAMS licenses depending on where you obtain it. For further
Packit c4476c
# details see http://www.openssl.org/~appro/cryptogams/.
Packit c4476c
# ====================================================================
Packit c4476c
#
Packit c4476c
# December 2015
Packit c4476c
#
Packit c4476c
# ChaCha20 for s390x.
Packit c4476c
#
Packit c4476c
# 3 times faster than compiler-generated code.
Packit c4476c
Packit c4476c
#
Packit c4476c
# August 2018
Packit c4476c
#
Packit c4476c
# Add vx code path: 4x"vertical".
Packit c4476c
#
Packit c4476c
# Copyright IBM Corp. 2018
Packit c4476c
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
Packit c4476c
Packit c4476c
#
Packit c4476c
# February 2019
Packit c4476c
#
Packit c4476c
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3 times faster than scalar code.
# But to keep overheads in check, fall back to a transliteration of the
# VSX code path from the chacha-ppc module, which is also 4x"vertical",
# to handle inputs not longer than 256 bytes.
Packit c4476c
Packit c4476c
use strict;
Packit c4476c
use FindBin qw($Bin);
Packit c4476c
use lib "$Bin/../..";
Packit c4476c
use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
Packit c4476c
Packit c4476c
# The first command-line argument names the target flavour. A flavour
# containing "31" or "32" selects the 31-bit layout ($z=0, 4-byte
# pointers); anything else, including no argument at all, selects the
# 64-bit layout ($z=1, 8-byte pointers).
my $flavour = shift;

my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

# Skip further arguments until one looks like a file name ("name.ext");
# that one becomes the output file. $output ends up false when no such
# argument exists.
my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

my $sp="%r15";			# stack pointer register
# Size of the standard frame: 16 pointer-sized slots followed by 4*8
# bytes. The 31-bit vector paths in this file save %f4/%f6 into the last
# two 8-byte slots of that trailing area.
my $stdframe=16*$SIZE_T+4*8;
Packit c4476c
Packit c4476c
# ROUND($a0,$b0,$c0,$d0) - emit the scalar code for one ChaCha round,
# i.e. four quarter-rounds processed as two interleaved pairs (Q1/Q2,
# then Q3/Q4).
#
# The arguments are the state-word indices used by the first
# quarter-round. The indices for the other three are derived by stepping
# the low two bits of each index, which stays within the same group of
# four words. State words are mapped to registers through @x: words 0..7
# are %r0..%r7 and words 12..15 are %r10..%r13. Words 8..11 (the 'c'
# words) only have a placeholder in @x because they live on the stack;
# the pair currently being worked on is held in %r8/%r9.
#
# Uses the file-level $stdframe and $sp to address the 'c' words on the
# stack. Called for its side effect: each mnemonic call (alr, xr, rll,
# stm, lm - presumably provided by perlasm::s390x) emits one instruction.
sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# 'c' stores and loads in the middle, but none in the beginning
	# or end.

	alr	(@x[$a0],@x[$b0]);	# Q1
	 alr	(@x[$a1],@x[$b1]);	# Q2
	xr	(@x[$d0],@x[$a0]);
	 xr	(@x[$d1],@x[$a1]);
	rll	(@x[$d0],@x[$d0],16);
	 rll	(@x[$d1],@x[$d1],16);

	alr	($xc,@x[$d0]);
	 alr	($xc_,@x[$d1]);
	xr	(@x[$b0],$xc);
	 xr	(@x[$b1],$xc_);
	rll	(@x[$b0],@x[$b0],12);
	 rll	(@x[$b1],@x[$b1],12);

	alr	(@x[$a0],@x[$b0]);
	 alr	(@x[$a1],@x[$b1]);
	xr	(@x[$d0],@x[$a0]);
	 xr	(@x[$d1],@x[$a1]);
	rll	(@x[$d0],@x[$d0],8);
	 rll	(@x[$d1],@x[$d1],8);

	alr	($xc,@x[$d0]);
	 alr	($xc_,@x[$d1]);
	xr	(@x[$b0],$xc);
	 xr	(@x[$b1],$xc_);
	rll	(@x[$b0],@x[$b0],7);
	 rll	(@x[$b1],@x[$b1],7);

	# Swap the 'c' pair held in %r8/%r9: park the pair used by Q1/Q2
	# in its stack slot and fetch the pair needed by Q3/Q4.
	stm	($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");	# reload pair of 'c's
	lm	($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");

	alr	(@x[$a2],@x[$b2]);	# Q3
	 alr	(@x[$a3],@x[$b3]);	# Q4
	xr	(@x[$d2],@x[$a2]);
	 xr	(@x[$d3],@x[$a3]);
	rll	(@x[$d2],@x[$d2],16);
	 rll	(@x[$d3],@x[$d3],16);

	alr	($xc,@x[$d2]);
	 alr	($xc_,@x[$d3]);
	xr	(@x[$b2],$xc);
	 xr	(@x[$b3],$xc_);
	rll	(@x[$b2],@x[$b2],12);
	 rll	(@x[$b3],@x[$b3],12);

	alr	(@x[$a2],@x[$b2]);
	 alr	(@x[$a3],@x[$b3]);
	xr	(@x[$d2],@x[$a2]);
	 xr	(@x[$d3],@x[$a3]);
	rll	(@x[$d2],@x[$d2],8);
	 rll	(@x[$d3],@x[$d3],8);

	alr	($xc,@x[$d2]);
	 alr	($xc_,@x[$d3]);
	xr	(@x[$b2],$xc);
	 xr	(@x[$b3],$xc_);
	rll	(@x[$b2],@x[$b2],7);
	 rll	(@x[$b3],@x[$b3],7);
}
Packit c4476c
Packit c4476c
# VX_lane_ROUND($a0,$b0,$c0,$d0) - emit the vector code for one ChaCha
# round in the 4x"vertical" layout, where state word N is held in %vN
# (N = 0..15).
#
# The arguments are the state-word indices used by the first
# quarter-round. The indices for the other three are derived by stepping
# the low two bits of each index, exactly as in the scalar ROUND. Each of
# the four add/xor/rotate steps (rotate counts 16, 12, 8, 7) is emitted
# for all four quarter-rounds before the next step starts.
#
# Called for its side effect: each mnemonic call (vaf, vx, verllf -
# presumably provided by perlasm::s390x) emits one instruction.
sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));

	# a += b; d ^= a; d <<<= 16
	vaf	(@x[$a0],@x[$a0],@x[$b0]);	# Q1
	vx	(@x[$d0],@x[$d0],@x[$a0]);
	verllf	(@x[$d0],@x[$d0],16);
	vaf	(@x[$a1],@x[$a1],@x[$b1]);	# Q2
	vx	(@x[$d1],@x[$d1],@x[$a1]);
	verllf	(@x[$d1],@x[$d1],16);
	vaf	(@x[$a2],@x[$a2],@x[$b2]);	# Q3
	vx	(@x[$d2],@x[$d2],@x[$a2]);
	verllf	(@x[$d2],@x[$d2],16);
	vaf	(@x[$a3],@x[$a3],@x[$b3]);	# Q4
	vx	(@x[$d3],@x[$d3],@x[$a3]);
	verllf	(@x[$d3],@x[$d3],16);

	# c += d; b ^= c; b <<<= 12
	vaf	(@x[$c0],@x[$c0],@x[$d0]);
	vx	(@x[$b0],@x[$b0],@x[$c0]);
	verllf	(@x[$b0],@x[$b0],12);
	vaf	(@x[$c1],@x[$c1],@x[$d1]);
	vx	(@x[$b1],@x[$b1],@x[$c1]);
	verllf	(@x[$b1],@x[$b1],12);
	vaf	(@x[$c2],@x[$c2],@x[$d2]);
	vx	(@x[$b2],@x[$b2],@x[$c2]);
	verllf	(@x[$b2],@x[$b2],12);
	vaf	(@x[$c3],@x[$c3],@x[$d3]);
	vx	(@x[$b3],@x[$b3],@x[$c3]);
	verllf	(@x[$b3],@x[$b3],12);

	# a += b; d ^= a; d <<<= 8
	vaf	(@x[$a0],@x[$a0],@x[$b0]);
	vx	(@x[$d0],@x[$d0],@x[$a0]);
	verllf	(@x[$d0],@x[$d0],8);
	vaf	(@x[$a1],@x[$a1],@x[$b1]);
	vx	(@x[$d1],@x[$d1],@x[$a1]);
	verllf	(@x[$d1],@x[$d1],8);
	vaf	(@x[$a2],@x[$a2],@x[$b2]);
	vx	(@x[$d2],@x[$d2],@x[$a2]);
	verllf	(@x[$d2],@x[$d2],8);
	vaf	(@x[$a3],@x[$a3],@x[$b3]);
	vx	(@x[$d3],@x[$d3],@x[$a3]);
	verllf	(@x[$d3],@x[$d3],8);

	# c += d; b ^= c; b <<<= 7
	vaf	(@x[$c0],@x[$c0],@x[$d0]);
	vx	(@x[$b0],@x[$b0],@x[$c0]);
	verllf	(@x[$b0],@x[$b0],7);
	vaf	(@x[$c1],@x[$c1],@x[$d1]);
	vx	(@x[$b1],@x[$b1],@x[$c1]);
	verllf	(@x[$b1],@x[$b1],7);
	vaf	(@x[$c2],@x[$c2],@x[$d2]);
	vx	(@x[$b2],@x[$b2],@x[$c2]);
	verllf	(@x[$b2],@x[$b2],7);
	vaf	(@x[$c3],@x[$c3],@x[$d3]);
	vx	(@x[$b3],@x[$b3],@x[$c3]);
	verllf	(@x[$b3],@x[$b3],7);
}
Packit c4476c
Packit c4476c
# VX_ROUND(@a[0..5], @b[0..5], @c[0..5], @d[0..5], $odd) - emit the vector
# code for one ChaCha round on six states at once (the 6x"horizontal"
# layout).
#
# Arguments 0..23 are register names: six 'a' registers, then six 'b',
# six 'c' and six 'd' registers. Argument 24 is a flag that is false for
# the first round of a double round and true for the second.
#
# Every step is emitted for all six states before the next step starts.
# After the four add/xor/rotate steps the b, c and d registers are rotated
# as a whole by vsldb with both sources equal: by 4, 8 and 12 bytes when
# $odd is false, and by 12, 8 and 4 bytes when it is true, so an
# even/odd pair of calls leaves the registers in their original
# arrangement.
#
# Called for its side effect: each mnemonic call (vaf, vx, verllf, vsldb -
# presumably provided by perlasm::s390x) emits one instruction.
sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=$_[24];

	# a += b; d ^= a; d <<<= 16
	vaf		(@a[$_],@a[$_],@b[$_]) for (0..5);
	vx		(@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf		(@d[$_],@d[$_],16) for (0..5);

	# c += d; b ^= c; b <<<= 12
	vaf		(@c[$_],@c[$_],@d[$_]) for (0..5);
	vx		(@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf		(@b[$_],@b[$_],12) for (0..5);

	# a += b; d ^= a; d <<<= 8
	vaf		(@a[$_],@a[$_],@b[$_]) for (0..5);
	vx		(@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf		(@d[$_],@d[$_],8) for (0..5);

	# c += d; b ^= c; b <<<= 7
	vaf		(@c[$_],@c[$_],@d[$_]) for (0..5);
	vx		(@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf		(@b[$_],@b[$_],7) for (0..5);

	# rotate the rows into place for the next round
	vsldb		(@c[$_],@c[$_],@c[$_],8) for (0..5);
	vsldb		(@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
	vsldb		(@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}
Packit c4476c
Packit c4476c
# Start code generation. PERLASM_BEGIN, INCLUDE and TEXT are presumably
# provided by perlasm::s390x: begin output to $output, reference
# s390x_arch.h (the S390X_STFLE offset used by the code generated below
# is not defined in this file) and switch to the text section.
PERLASM_BEGIN($output);

INCLUDE	("s390x_arch.h");
TEXT	();
Packit c4476c
Packit c4476c
################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
#                     const unsigned int key[8], const unsigned int counter[4])
#
# Entry point plus the scalar code path. The entry returns at once when
# len is zero. Inputs longer than 64 bytes are handed to the vector code
# at .LChaCha20_ctr32_vx when the vx capability bit is set; everything
# else is processed here, 64 bytes per .Loop_outer iteration, with a
# byte-wise tail for a final partial block.
#
# Stack frame of the scalar path ($frame = $stdframe + 4*20 bytes):
#   $stdframe+4*0  .. +4*15	16-word input block: sigma, key, counter
#   $stdframe+4*16 .. +4*19	working copies of x[8]-x[11] (the 'c' words)
# Three more values are parked at $frame+N*$SIZE_T from the new stack
# pointer, i.e. N slots above the caller's stack pointer:
#   N=2  current input pointer
#   N=3  out-inp difference
#   N=4  end of input minus 64
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
# Same register map as in ROUND: x[0..7] in %r0..%r7, x[12..15] in
# %r10..%r13, x[8..11] in memory with the active pair in @t.
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

GLOBL	("ChaCha20_ctr32");
TYPE	("ChaCha20_ctr32","\@function");
ALIGN	(32);
LABEL	("ChaCha20_ctr32");
	larl	("%r1","OPENSSL_s390xcap_P");

	lghi	("%r0",64);
&{$z?	\&ltgr:\&ltr}	($len,$len);		# len==0?
	bzr	("%r14");
	lg	("%r1","S390X_STFLE+16(%r1)");
&{$z?	\&clgr:\&clr}	($len,"%r0");
	jle	(".Lshort");			# len<=64: always scalar

	tmhh	("%r1",0x4000);			# check for vx bit
	jnz	(".LChaCha20_ctr32_vx");

LABEL	(".Lshort");
&{$z?	\&aghi:\&ahi}	($len,-64);
&{$z?	\&lghi:\&lhi}	("%r1",-$frame);
&{$z?	\&stmg:\&stm}	("%r6","%r15","6*$SIZE_T($sp)");
&{$z?	\&slgr:\&slr}	($out,$inp);	# difference
	la	($len,"0($inp,$len)");	# end of input minus 64
	larl	("%r7",".Lsigma");
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");		# allocate frame
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# old stack pointer at 0($sp)

	lmg	("%r8","%r11","0($key)");	# load key
	lmg	("%r12","%r13","0($counter)");	# load counter
	lmg	("%r6","%r7","0(%r7)");	# load sigma constant

	la	("%r14","0($inp)");
&{$z?	\&stg:\&st}	($out,"$frame+3*$SIZE_T($sp)");
&{$z?	\&stg:\&st}	($len,"$frame+4*$SIZE_T($sp)");
	stmg	("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
	srlg	(@x[12],"%r12",32);	# 32-bit counter value
	j	(".Loop_outer");

ALIGN	(16);
LABEL	(".Loop_outer");
	lm	(@x[0],@x[7],"$stdframe+4*0($sp)");	# load x[0]-x[7]
	lm	(@t[0],@t[1],"$stdframe+4*10($sp)");	# load x[10]-x[11]
	lm	(@x[13],@x[15],"$stdframe+4*13($sp)");	# load x[13]-x[15]
	stm	(@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
	lm	(@t[0],@t[1],"$stdframe+4*8($sp)");	# load x[8]-x[9]
	st	(@x[12],"$stdframe+4*12($sp)");	# save counter
&{$z?	\&stg:\&st}	("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
	lhi	("%r14",10);			# 10 iterations of two rounds
	j	(".Loop");

ALIGN	(4);
LABEL	(".Loop");
	ROUND	(0, 4, 8,12);
	ROUND	(0, 5,10,15);
	brct	("%r14",".Loop");

&{$z?	\&lg:\&l}	("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
	stm	(@t[0],@t[1],"$stdframe+4*8+4*8($sp)");	# offload x[8]-x[9]
	# @t[0] = out-inp difference, @t[1] = end of input minus 64
&{$z?	\&lmg:\&lm}	(@t[0],@t[1],"$frame+3*$SIZE_T($sp)");

	al	(@x[0],"$stdframe+4*0($sp)");	# accumulate key schedule
	al	(@x[1],"$stdframe+4*1($sp)");
	al	(@x[2],"$stdframe+4*2($sp)");
	al	(@x[3],"$stdframe+4*3($sp)");
	al	(@x[4],"$stdframe+4*4($sp)");
	al	(@x[5],"$stdframe+4*5($sp)");
	al	(@x[6],"$stdframe+4*6($sp)");
	al	(@x[7],"$stdframe+4*7($sp)");
	lrvr	(@x[0],@x[0]);			# byte-reverse each word
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	lrvr	(@x[4],@x[4]);
	lrvr	(@x[5],@x[5]);
	lrvr	(@x[6],@x[6]);
	lrvr	(@x[7],@x[7]);
	al	(@x[12],"$stdframe+4*12($sp)");
	al	(@x[13],"$stdframe+4*13($sp)");
	al	(@x[14],"$stdframe+4*14($sp)");
	al	(@x[15],"$stdframe+4*15($sp)");
	lrvr	(@x[12],@x[12]);
	lrvr	(@x[13],@x[13]);
	lrvr	(@x[14],@x[14]);
	lrvr	(@x[15],@x[15]);

	la	(@t[0],"0(@t[0],%r14)");	# reconstruct output pointer
&{$z?	\&clgr:\&clr}	("%r14",@t[1]);
	jh	(".Ltail");			# fewer than 64 bytes left

	# Full block. The indented instructions are a second stream that
	# finishes x[8]-x[11], reusing @x[0..3] once those are stored.
	x	(@x[0],"4*0(%r14)");	# xor with input
	x	(@x[1],"4*1(%r14)");
	st	(@x[0],"4*0(@t[0])");	# store output
	x	(@x[2],"4*2(%r14)");
	st	(@x[1],"4*1(@t[0])");
	x	(@x[3],"4*3(%r14)");
	st	(@x[2],"4*2(@t[0])");
	x	(@x[4],"4*4(%r14)");
	st	(@x[3],"4*3(@t[0])");
	 lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");	# load x[8]-x[11]
	x	(@x[5],"4*5(%r14)");
	st	(@x[4],"4*4(@t[0])");
	x	(@x[6],"4*6(%r14)");
	 al	(@x[0],"$stdframe+4*8($sp)");
	st	(@x[5],"4*5(@t[0])");
	x	(@x[7],"4*7(%r14)");
	 al	(@x[1],"$stdframe+4*9($sp)");
	st	(@x[6],"4*6(@t[0])");
	x	(@x[12],"4*12(%r14)");
	 al	(@x[2],"$stdframe+4*10($sp)");
	st	(@x[7],"4*7(@t[0])");
	x	(@x[13],"4*13(%r14)");
	 al	(@x[3],"$stdframe+4*11($sp)");
	st	(@x[12],"4*12(@t[0])");
	x	(@x[14],"4*14(%r14)");
	st	(@x[13],"4*13(@t[0])");
	x	(@x[15],"4*15(%r14)");
	st	(@x[14],"4*14(@t[0])");
	 lrvr	(@x[0],@x[0]);
	st	(@x[15],"4*15(@t[0])");
	 lrvr	(@x[1],@x[1]);
	 lrvr	(@x[2],@x[2]);
	 lrvr	(@x[3],@x[3]);
	lhi	(@x[12],1);
	 x	(@x[0],"4*8(%r14)");
	al	(@x[12],"$stdframe+4*12($sp)");	# increment counter
	 x	(@x[1],"4*9(%r14)");
	 st	(@x[0],"4*8(@t[0])");
	 x	(@x[2],"4*10(%r14)");
	 st	(@x[1],"4*9(@t[0])");
	 x	(@x[3],"4*11(%r14)");
	 st	(@x[2],"4*10(@t[0])");
	 st	(@x[3],"4*11(@t[0])");

&{$z?	\&clgr:\&clr}	("%r14",@t[1]);	# done yet?
	la	("%r14","64(%r14)");
	jl	(".Loop_outer");

LABEL	(".Ldone");
	xgr	("%r0","%r0");
	xgr	("%r1","%r1");
	xgr	("%r2","%r2");
	xgr	("%r3","%r3");
	stmg	("%r0","%r3","$stdframe+4*4($sp)");	# wipe key copy
	stmg	("%r0","%r3","$stdframe+4*12($sp)");

	# restoring %r15 from the save area also releases the frame
&{$z?	\&lmg:\&lm}	("%r6","%r15","$frame+6*$SIZE_T($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail");
	# Partial last block: write the key stream over the block copy at
	# $stdframe, then xor it into the output one byte at a time.
	la	(@t[1],"64($t[1])");
	stm	(@x[0],@x[7],"$stdframe+4*0($sp)");
&{$z?	\&slgr:\&slr}	(@t[1],"%r14");		# number of bytes left
	lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
&{$z?	\&lghi:\&lhi}	(@x[6],0);		# byte index
	stm	(@x[12],@x[15],"$stdframe+4*12($sp)");
	al	(@x[0],"$stdframe+4*8($sp)");
	al	(@x[1],"$stdframe+4*9($sp)");
	al	(@x[2],"$stdframe+4*10($sp)");
	al	(@x[3],"$stdframe+4*11($sp)");
	lrvr	(@x[0],@x[0]);
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	stm	(@x[0],@x[3],"$stdframe+4*8($sp)");

LABEL	(".Loop_tail");
	llgc	(@x[4],"0(@x[6],%r14)");
	llgc	(@x[5],"$stdframe(@x[6],$sp)");
	xr	(@x[5],@x[4]);
	stc	(@x[5],"0(@x[6],@t[0])");
	la	(@x[6],"1(@x[6])");
	brct	(@t[1],".Loop_tail");

	j	(".Ldone");
SIZE	("ChaCha20_ctr32",".-ChaCha20_ctr32");
}
Packit c4476c
Packit c4476c
########################################################################
# 4x"vertical" layout minimizes amount of instructions, but pipeline
# runs underutilized [because of vector instructions' high latency].
# On the other hand minimum amount of data it takes to fully utilize
# the pipeline is higher, so that effectively, short inputs would be
# processed slower. Hence this code path targeting <=256 bytes lengths.
#
# Within this file the path is reached only from .LChaCha20_ctr32_vx,
# and only for 64 < len <= 256. One pass over the rounds yields four
# 64-byte key-stream blocks, which are then consumed one after another.
#
# Register use:
#   %v0..%v15	state words 0..15, one block per word element
#   @K		sigma, key halves and counter row as loaded
#   $CTR	word 0 of the counter row replicated, plus the vector
#		loaded from 0x50(.Lsigma)
#   $xt0..$xt3	scratch
#   $beperm	permute mask loaded from 0x40(.Lsigma)
# .Lsigma is defined outside this block: sigma is expected at offset 0
# and the "smashed" sigma (four vectors) at 0x60; the vector at 0x50 is
# presumably the per-block counter increments - confirm at .Lsigma.
#
{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
my @K=map("%v$_",(16..19));
my $CTR="%v26";
my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
my $beperm="%v31";
my $FRAME=$stdframe+4*16;

ALIGN	(32);
LABEL	("ChaCha20_ctr32_4x");
LABEL	(".LChaCha20_ctr32_4x");
&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	# 31-bit: keep %f4/%f6 in the caller-provided save area
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
if ($z) {
	# 64-bit: keep %f8..%f15 in the new frame; the same 64 bytes are
	# reused as the key-stream buffer in .Ltail_4x once the registers
	# have been restored
	std	("%f8","$stdframe+8*0($sp)");
	std	("%f9","$stdframe+8*1($sp)");
	std	("%f10","$stdframe+8*2($sp)");
	std	("%f11","$stdframe+8*3($sp)");
	std	("%f12","$stdframe+8*4($sp)");
	std	("%f13","$stdframe+8*5($sp)");
	std	("%f14","$stdframe+8*6($sp)");
	std	("%f15","$stdframe+8*7($sp)");
}
	larl	("%r7",".Lsigma");
	lhi	("%r0",10);			# 10 iterations of two rounds
	lhi	("%r1",0);

	vl	(@K[0],"0(%r7)");		# load sigma
	vl	(@K[1],"0($key)");		# load key
	vl	(@K[2],"16($key)");
	vl	(@K[3],"0($counter)");		# load counter

	vl	($beperm,"0x40(%r7)");
	vl	($xt1,"0x50(%r7)");
	vrepf	($CTR,@K[3],0);
	vlvgf	(@K[3],"%r1",0);		# clear @K[3].word[0]
	vaf	($CTR,$CTR,$xt1);

#LABEL	(".Loop_outer_4x");
	vlm	($xa0,$xa3,"0x60(%r7)");	# load [smashed] sigma

	vrepf	($xb0,@K[1],0);			# smash the key
	vrepf	($xb1,@K[1],1);
	vrepf	($xb2,@K[1],2);
	vrepf	($xb3,@K[1],3);

	vrepf	($xc0,@K[2],0);
	vrepf	($xc1,@K[2],1);
	vrepf	($xc2,@K[2],2);
	vrepf	($xc3,@K[2],3);

	vlr	($xd0,$CTR);
	vrepf	($xd1,@K[3],1);
	vrepf	($xd2,@K[3],2);
	vrepf	($xd3,@K[3],3);

LABEL	(".Loop_4x");
	VX_lane_ROUND(0, 4, 8,12);
	VX_lane_ROUND(0, 5,10,15);
	brct	("%r0",".Loop_4x");

	# Add the per-block counters here; word 0 of @K[3] was cleared so
	# the later "+@K[3]" does not add the counter a second time.
	vaf	($xd0,$xd0,$CTR);

	vmrhf	($xt0,$xa0,$xa1);		# transpose data
	vmrhf	($xt1,$xa2,$xa3);
	vmrlf	($xt2,$xa0,$xa1);
	vmrlf	($xt3,$xa2,$xa3);
	vpdi	($xa0,$xt0,$xt1,0b0000);
	vpdi	($xa1,$xt0,$xt1,0b0101);
	vpdi	($xa2,$xt2,$xt3,0b0000);
	vpdi	($xa3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xb0,$xb1);
	vmrhf	($xt1,$xb2,$xb3);
	vmrlf	($xt2,$xb0,$xb1);
	vmrlf	($xt3,$xb2,$xb3);
	vpdi	($xb0,$xt0,$xt1,0b0000);
	vpdi	($xb1,$xt0,$xt1,0b0101);
	vpdi	($xb2,$xt2,$xt3,0b0000);
	vpdi	($xb3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xc0,$xc1);
	vmrhf	($xt1,$xc2,$xc3);
	vmrlf	($xt2,$xc0,$xc1);
	vmrlf	($xt3,$xc2,$xc3);
	vpdi	($xc0,$xt0,$xt1,0b0000);
	vpdi	($xc1,$xt0,$xt1,0b0101);
	vpdi	($xc2,$xt2,$xt3,0b0000);
	vpdi	($xc3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xd0,$xd1);
	vmrhf	($xt1,$xd2,$xd3);
	vmrlf	($xt2,$xd0,$xd1);
	vmrlf	($xt3,$xd2,$xd3);
	vpdi	($xd0,$xt0,$xt1,0b0000);
	vpdi	($xd1,$xt0,$xt1,0b0101);
	vpdi	($xd2,$xt2,$xt3,0b0000);
	vpdi	($xd3,$xt2,$xt3,0b0101);

	#vrepif	($xt0,4);
	#vaf	($CTR,$CTR,$xt0);		# next counter value

	# Block 1: add the input rows and apply the byte permutation. The
	# length check is disabled because len > 64 on this path.
	vaf	($xa0,$xa0,@K[0]);
	vaf	($xb0,$xb0,@K[1]);
	vaf	($xc0,$xc0,@K[2]);
	vaf	($xd0,$xd0,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

	#&{$z?	\&clgfi:\&clfi} ($len,0x40);
	#jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	#je	(".Ldone_4x");

	# Block 2: result goes to $xa0..$xd0 so that .Ltail_4x always
	# finds the current key-stream block there.
	vaf	($xa0,$xa1,@K[0]);
	vaf	($xb0,$xb1,@K[1]);
	vaf	($xc0,$xc1,@K[2]);
	vaf	($xd0,$xd1,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_4x");

	# Block 3
	vaf	($xa0,$xa2,@K[0]);
	vaf	($xb0,$xb2,@K[1]);
	vaf	($xc0,$xc2,@K[2]);
	vaf	($xd0,$xd2,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_4x");

	# Block 4
	vaf	($xa0,$xa3,@K[0]);
	vaf	($xb0,$xb3,@K[1]);
	vaf	($xc0,$xc3,@K[2]);
	vaf	($xd0,$xd3,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	#la	$inp,0x40($inp));
	#la	$out,0x40($out));
	#lhi	%r0,10);
	#&{$z?	\&aghi:\&ahi}	$len,-0x40);
	#jne	.Loop_outer_4x);

LABEL	(".Ldone_4x");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$stdframe+8*0($sp)");
	ld	("%f9","$stdframe+8*1($sp)");
	ld	("%f10","$stdframe+8*2($sp)");
	ld	("%f11","$stdframe+8*3($sp)");
	ld	("%f12","$stdframe+8*4($sp)");
	ld	("%f13","$stdframe+8*5($sp)");
	ld	("%f14","$stdframe+8*6($sp)");
	ld	("%f15","$stdframe+8*7($sp)");
}
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail_4x");
	# Partial block: restore the floating point registers, spill the
	# current key-stream block to the stack and xor byte by byte.
	# Floating point register N shares storage with %vN, so rows held
	# in registers about to be reloaded are first copied to scratch.
if (!$z) {
	vlr	($xt0,$xb0);			# %v4 is reloaded as %f4
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");

	vst	($xa0,"$stdframe+0x00($sp)");
	vst	($xt0,"$stdframe+0x10($sp)");
	vst	($xc0,"$stdframe+0x20($sp)");
	vst	($xd0,"$stdframe+0x30($sp)");
} else {
	vlr	($xt0,$xc0);			# %v8 is reloaded as %f8
	ld	("%f8","$stdframe+8*0($sp)");
	ld	("%f9","$stdframe+8*1($sp)");
	ld	("%f10","$stdframe+8*2($sp)");
	ld	("%f11","$stdframe+8*3($sp)");
	vlr	($xt1,$xd0);			# %v12 is reloaded as %f12
	ld	("%f12","$stdframe+8*4($sp)");
	ld	("%f13","$stdframe+8*5($sp)");
	ld	("%f14","$stdframe+8*6($sp)");
	ld	("%f15","$stdframe+8*7($sp)");

	vst	($xa0,"$stdframe+0x00($sp)");
	vst	($xb0,"$stdframe+0x10($sp)");
	vst	($xt0,"$stdframe+0x20($sp)");
	vst	($xt1,"$stdframe+0x30($sp)");
}
	lghi	("%r1",0);			# byte index

LABEL	(".Loop_tail_4x");
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_4x");

&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
}
Packit c4476c
Packit c4476c
########################################################################
Packit c4476c
# 6x"horizontal" layout is optimal fit for the platform in its current
Packit c4476c
# shape, more specifically for given vector instructions' latency. Well,
Packit c4476c
# computational part of 8x"vertical" would be faster, but it consumes
Packit c4476c
# all registers and dealing with that will diminish the return...
Packit c4476c
#
Packit c4476c
{
Packit c4476c
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
Packit c4476c
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
Packit c4476c
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
Packit c4476c
my @K=map("%v$_",(27,24..26));
Packit c4476c
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
Packit c4476c
my $beperm="%v31";
Packit c4476c
my $FRAME=$stdframe + 4*16;
Packit c4476c
Packit c4476c
GLOBL	("ChaCha20_ctr32_vx");
Packit c4476c
ALIGN	(32);
Packit c4476c
LABEL	("ChaCha20_ctr32_vx");
Packit c4476c
LABEL	(".LChaCha20_ctr32_vx");
Packit c4476c
&{$z?	\&clgfi:\&clfi}	($len,256);
Packit c4476c
	jle	(".LChaCha20_ctr32_4x");
Packit c4476c
&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
Packit c4476c
if (!$z) {
Packit c4476c
	std	("%f4","16*$SIZE_T+2*8($sp)");
Packit c4476c
	std	("%f6","16*$SIZE_T+3*8($sp)");
Packit c4476c
}
Packit c4476c
&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
Packit c4476c
	lgr	("%r0",$sp);
Packit c4476c
	la	($sp,"0(%r1,$sp)");
Packit c4476c
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
Packit c4476c
if ($z) {
Packit c4476c
	std	("%f8","$FRAME-8*8($sp)");
Packit c4476c
	std	("%f9","$FRAME-8*7($sp)");
Packit c4476c
	std	("%f10","$FRAME-8*6($sp)");
Packit c4476c
	std	("%f11","$FRAME-8*5($sp)");
Packit c4476c
	std	("%f12","$FRAME-8*4($sp)");
Packit c4476c
	std	("%f13","$FRAME-8*3($sp)");
Packit c4476c
	std	("%f14","$FRAME-8*2($sp)");
Packit c4476c
	std	("%f15","$FRAME-8*1($sp)");
Packit c4476c
}
Packit c4476c
	larl	("%r7",".Lsigma");
Packit c4476c
	lhi	("%r0",10);
Packit c4476c
Packit c4476c
	vlm	(@K[1],@K[2],"0($key)");	# load key
Packit c4476c
	vl	(@K[3],"0($counter)");		# load counter
Packit c4476c
Packit c4476c
	vlm	(@K[0],"$beperm","0(%r7)");	# load sigma, increments, ...
Packit c4476c
Packit c4476c
LABEL	(".Loop_outer_vx");
Packit c4476c
	vlr	($a0,@K[0]);
Packit c4476c
	vlr	($b0,@K[1]);
Packit c4476c
	vlr	($a1,@K[0]);
Packit c4476c
	vlr	($b1,@K[1]);
Packit c4476c
	vlr	($a2,@K[0]);
Packit c4476c
	vlr	($b2,@K[1]);
Packit c4476c
	vlr	($a3,@K[0]);
Packit c4476c
	vlr	($b3,@K[1]);
Packit c4476c
	vlr	($a4,@K[0]);
Packit c4476c
	vlr	($b4,@K[1]);
Packit c4476c
	vlr	($a5,@K[0]);
Packit c4476c
	vlr	($b5,@K[1]);
Packit c4476c
Packit c4476c
	vlr	($d0,@K[3]);
Packit c4476c
	vaf	($d1,@K[3],$t1);		# K[3]+1
Packit c4476c
	vaf	($d2,@K[3],$t2);		# K[3]+2
Packit c4476c
	vaf	($d3,@K[3],$t3);		# K[3]+3
Packit c4476c
	vaf	($d4,$d2,$t2);			# K[3]+4
Packit c4476c
	vaf	($d5,$d2,$t3);			# K[3]+5
Packit c4476c
Packit c4476c
	vlr	($c0,@K[2]);
Packit c4476c
	vlr	($c1,@K[2]);
Packit c4476c
	vlr	($c2,@K[2]);
Packit c4476c
	vlr	($c3,@K[2]);
Packit c4476c
	vlr	($c4,@K[2]);
Packit c4476c
	vlr	($c5,@K[2]);
Packit c4476c
Packit c4476c
	vlr	($t1,$d1);
Packit c4476c
	vlr	($t2,$d2);
Packit c4476c
	vlr	($t3,$d3);
Packit c4476c
Packit c4476c
ALIGN	(4);
Packit c4476c
LABEL	(".Loop_vx");
Packit c4476c
Packit c4476c
	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
Packit c4476c
		 $b0,$b1,$b2,$b3,$b4,$b5,
Packit c4476c
		 $c0,$c1,$c2,$c3,$c4,$c5,
Packit c4476c
		 $d0,$d1,$d2,$d3,$d4,$d5,
Packit c4476c
		 0);
Packit c4476c
Packit c4476c
	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
Packit c4476c
		 $b0,$b1,$b2,$b3,$b4,$b5,
Packit c4476c
		 $c0,$c1,$c2,$c3,$c4,$c5,
Packit c4476c
		 $d0,$d1,$d2,$d3,$d4,$d5,
Packit c4476c
		 1);
Packit c4476c
Packit c4476c
	brct	("%r0",".Loop_vx");
Packit c4476c
Packit c4476c
	vaf	($a0,$a0,@K[0]);
Packit c4476c
	vaf	($b0,$b0,@K[1]);
Packit c4476c
	vaf	($c0,$c0,@K[2]);
Packit c4476c
	vaf	($d0,$d0,@K[3]);
Packit c4476c
	vaf	($a1,$a1,@K[0]);
Packit c4476c
	vaf	($d1,$d1,$t1);			# +K[3]+1
Packit c4476c
Packit c4476c
	vperm	($a0,$a0,$a0,$beperm);
Packit c4476c
	vperm	($b0,$b0,$b0,$beperm);
Packit c4476c
	vperm	($c0,$c0,$c0,$beperm);
Packit c4476c
	vperm	($d0,$d0,$d0,$beperm);
Packit c4476c
Packit c4476c
&{$z?	\&clgfi:\&clfi}	($len,0x40);
Packit c4476c
	jl	(".Ltail_vx");
Packit c4476c
Packit c4476c
	vaf	($d2,$d2,$t2);			# +K[3]+2
Packit c4476c
	vaf	($d3,$d3,$t3);			# +K[3]+3
Packit c4476c
	vlm	($t0,$t3,"0($inp)");
Packit c4476c
Packit c4476c
	vx	($a0,$a0,$t0);
Packit c4476c
	vx	($b0,$b0,$t1);
Packit c4476c
	vx	($c0,$c0,$t2);
Packit c4476c
	vx	($d0,$d0,$t3);
Packit c4476c
Packit c4476c
	vlm	(@K[0],$t3,"0(%r7)");		# re-load sigma and increments
Packit c4476c
Packit c4476c
	vstm	($a0,$d0,"0($out)");
Packit c4476c
Packit c4476c
	la	($inp,"0x40($inp)");
Packit c4476c
	la	($out,"0x40($out)");
Packit c4476c
&{$z?	\&aghi:\&ahi}	($len,-0x40);
Packit c4476c
	je	(".Ldone_vx");
Packit c4476c
Packit c4476c
	vaf	($b1,$b1,@K[1]);
Packit c4476c
	vaf	($c1,$c1,@K[2]);
Packit c4476c
Packit c4476c
	vperm	($a0,$a1,$a1,$beperm);
Packit c4476c
	vperm	($b0,$b1,$b1,$beperm);
Packit c4476c
	vperm	($c0,$c1,$c1,$beperm);
Packit c4476c
	vperm	($d0,$d1,$d1,$beperm);
Packit c4476c
Packit c4476c
&{$z?	\&clgfi:\&clfi} ($len,0x40);
Packit c4476c
	jl	(".Ltail_vx");
Packit c4476c
Packit c4476c
	vlm	($a1,$d1,"0($inp)");
Packit c4476c
Packit c4476c
	vx	($a0,$a0,$a1);
Packit c4476c
	vx	($b0,$b0,$b1);
Packit c4476c
	vx	($c0,$c0,$c1);
Packit c4476c
	vx	($d0,$d0,$d1);
Packit c4476c
Packit c4476c
	vstm	($a0,$d0,"0($out)");
Packit c4476c
Packit c4476c
	la	($inp,"0x40($inp)");
Packit c4476c
	la	($out,"0x40($out)");
Packit c4476c
&{$z?	\&aghi:\&ahi}	($len,-0x40);
Packit c4476c
	je	(".Ldone_vx");
Packit c4476c
Packit c4476c
	vaf	($a2,$a2,@K[0]);
Packit c4476c
	vaf	($b2,$b2,@K[1]);
Packit c4476c
	vaf	($c2,$c2,@K[2]);
Packit c4476c
Packit c4476c
	vperm	($a0,$a2,$a2,$beperm);
Packit c4476c
	vperm	($b0,$b2,$b2,$beperm);
Packit c4476c
	vperm	($c0,$c2,$c2,$beperm);
Packit c4476c
	vperm	($d0,$d2,$d2,$beperm);
Packit c4476c
Packit c4476c
&{$z?	\&clgfi:\&clfi}	($len,0x40);
Packit c4476c
	jl	(".Ltail_vx");
Packit c4476c
Packit c4476c
	vlm	($a1,$d1,"0($inp)");
Packit c4476c
Packit c4476c
	vx	($a0,$a0,$a1);
Packit c4476c
	vx	($b0,$b0,$b1);
Packit c4476c
	vx	($c0,$c0,$c1);
Packit c4476c
	vx	($d0,$d0,$d1);
Packit c4476c
Packit c4476c
	vstm	($a0,$d0,"0($out)");
Packit c4476c
Packit c4476c
	la	($inp,"0x40($inp)");
Packit c4476c
	la	($out,"0x40($out)");
Packit c4476c
&{$z?	\&aghi:\&ahi}	($len,-0x40);
Packit c4476c
	je	(".Ldone_vx");
Packit c4476c
Packit c4476c
	vaf	($a3,$a3,@K[0]);
Packit c4476c
	vaf	($b3,$b3,@K[1]);
Packit c4476c
	vaf	($c3,$c3,@K[2]);
Packit c4476c
	vaf	($d2,@K[3],$t3);		# K[3]+3
Packit c4476c
Packit c4476c
	vperm	($a0,$a3,$a3,$beperm);
Packit c4476c
	vperm	($b0,$b3,$b3,$beperm);
Packit c4476c
	vperm	($c0,$c3,$c3,$beperm);
Packit c4476c
	vperm	($d0,$d3,$d3,$beperm);
Packit c4476c
Packit c4476c
&{$z?	\&clgfi:\&clfi}	($len,0x40);
Packit c4476c
	jl	(".Ltail_vx");
Packit c4476c
Packit c4476c
	vaf	($d3,$d2,$t1);			# K[3]+4
Packit c4476c
	vlm	($a1,$d1,"0($inp)");
Packit c4476c
Packit c4476c
	vx	($a0,$a0,$a1);
Packit c4476c
	vx	($b0,$b0,$b1);
Packit c4476c
	vx	($c0,$c0,$c1);
Packit c4476c
	vx	($d0,$d0,$d1);
Packit c4476c
Packit c4476c
	vstm	($a0,$d0,"0($out)");
Packit c4476c
Packit c4476c
	la	($inp,"0x40($inp)");
Packit c4476c
	la	($out,"0x40($out)");
Packit c4476c
&{$z?	\&aghi:\&ahi}	($len,-0x40);
Packit c4476c
	je	(".Ldone_vx");
Packit c4476c
Packit c4476c
	vaf	($a4,$a4,@K[0]);
Packit c4476c
	vaf	($b4,$b4,@K[1]);
Packit c4476c
	vaf	($c4,$c4,@K[2]);
Packit c4476c
	vaf	($d4,$d4,$d3);			# +K[3]+4
Packit c4476c
	vaf	($d3,$d3,$t1);			# K[3]+5
Packit c4476c
	vaf	(@K[3],$d2,$t3);		# K[3]+=6
Packit c4476c
Packit c4476c
	vperm	($a0,$a4,$a4,$beperm);
Packit c4476c
	vperm	($b0,$b4,$b4,$beperm);
Packit c4476c
	vperm	($c0,$c4,$c4,$beperm);
Packit c4476c
	vperm	($d0,$d4,$d4,$beperm);
Packit c4476c
Packit c4476c
&{$z?	\&clgfi:\&clfi}	($len,0x40);
Packit c4476c
	jl	(".Ltail_vx");
Packit c4476c
Packit c4476c
	vlm	($a1,$d1,"0($inp)");
Packit c4476c
Packit c4476c
	vx	($a0,$a0,$a1);
Packit c4476c
	vx	($b0,$b0,$b1);
Packit c4476c
	vx	($c0,$c0,$c1);
Packit c4476c
	vx	($d0,$d0,$d1);
Packit c4476c
Packit c4476c
	vstm	($a0,$d0,"0($out)");
Packit c4476c
Packit c4476c
	la	($inp,"0x40($inp)");
Packit c4476c
	la	($out,"0x40($out)");
Packit c4476c
&{$z?	\&aghi:\&ahi}	($len,-0x40);
Packit c4476c
	je	(".Ldone_vx");
Packit c4476c
Packit c4476c
	vaf	($a5,$a5,@K[0]);
Packit c4476c
	vaf	($b5,$b5,@K[1]);
Packit c4476c
	vaf	($c5,$c5,@K[2]);
Packit c4476c
	vaf	($d5,$d5,$d3);			# +K[3]+5
Packit c4476c
Packit c4476c
	vperm	($a0,$a5,$a5,$beperm);
Packit c4476c
	vperm	($b0,$b5,$b5,$beperm);
Packit c4476c
	vperm	($c0,$c5,$c5,$beperm);
Packit c4476c
	vperm	($d0,$d5,$d5,$beperm);
Packit c4476c
Packit c4476c
&{$z?	\&clgfi:\&clfi} ($len,0x40);
Packit c4476c
	jl	(".Ltail_vx");
Packit c4476c
Packit c4476c
	vlm	($a1,$d1,"0($inp)");
Packit c4476c
Packit c4476c
	vx	($a0,$a0,$a1);
Packit c4476c
	vx	($b0,$b0,$b1);
Packit c4476c
	vx	($c0,$c0,$c1);
Packit c4476c
	vx	($d0,$d0,$d1);
Packit c4476c
Packit c4476c
	vstm	($a0,$d0,"0($out)");
Packit c4476c
Packit c4476c
	la	($inp,"0x40($inp)");
Packit c4476c
	la	($out,"0x40($out)");
Packit c4476c
	lhi	("%r0",10);
Packit c4476c
&{$z?	\&aghi:\&ahi}	($len,-0x40);
Packit c4476c
	jne	(".Loop_outer_vx");
Packit c4476c
Packit c4476c
LABEL	(".Ldone_vx");
Packit c4476c
if (!$z) {
Packit c4476c
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
Packit c4476c
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
Packit c4476c
} else {
Packit c4476c
	ld	("%f8","$FRAME-8*8($sp)");
Packit c4476c
	ld	("%f9","$FRAME-8*7($sp)");
Packit c4476c
	ld	("%f10","$FRAME-8*6($sp)");
Packit c4476c
	ld	("%f11","$FRAME-8*5($sp)");
Packit c4476c
	ld	("%f12","$FRAME-8*4($sp)");
Packit c4476c
	ld	("%f13","$FRAME-8*3($sp)");
Packit c4476c
	ld	("%f14","$FRAME-8*2($sp)");
Packit c4476c
	ld	("%f15","$FRAME-8*1($sp)");
Packit c4476c
}
Packit c4476c
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
Packit c4476c
	la	($sp,"$FRAME($sp)");
Packit c4476c
	br	("%r14");
Packit c4476c
Packit c4476c
ALIGN	(16);
Packit c4476c
LABEL	(".Ltail_vx");
Packit c4476c
if (!$z) {
Packit c4476c
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
Packit c4476c
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
Packit c4476c
} else {
Packit c4476c
	ld	("%f8","$FRAME-8*8($sp)");
Packit c4476c
	ld	("%f9","$FRAME-8*7($sp)");
Packit c4476c
	ld	("%f10","$FRAME-8*6($sp)");
Packit c4476c
	ld	("%f11","$FRAME-8*5($sp)");
Packit c4476c
	ld	("%f12","$FRAME-8*4($sp)");
Packit c4476c
	ld	("%f13","$FRAME-8*3($sp)");
Packit c4476c
	ld	("%f14","$FRAME-8*2($sp)");
Packit c4476c
	ld	("%f15","$FRAME-8*1($sp)");
Packit c4476c
}
Packit c4476c
	vstm	($a0,$d0,"$stdframe($sp)");
Packit c4476c
	lghi	("%r1",0);
Packit c4476c
Packit c4476c
LABEL	(".Loop_tail_vx");
Packit c4476c
	llgc	("%r5","0(%r1,$inp)");
Packit c4476c
	llgc	("%r6","$stdframe(%r1,$sp)");
Packit c4476c
	xr	("%r6","%r5");
Packit c4476c
	stc	("%r6","0(%r1,$out)");
Packit c4476c
	la	("%r1","1(%r1)");
Packit c4476c
	brct	($len,".Loop_tail_vx");
Packit c4476c
Packit c4476c
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
Packit c4476c
	la	($sp,"$FRAME($sp)");
Packit c4476c
	br	("%r14");
Packit c4476c
SIZE	("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
Packit c4476c
}
Packit c4476c
################
Packit c4476c
Packit c4476c
# Read-only constant table shared by the vector code paths, followed by
# the module identification string and the end-of-module call.
#
# Layout matters: every LONG() row below is four 32-bit words (16 bytes,
# i.e. one vector register), and the rows are fetched with multi-register
# vector loads relative to .Lsigma, so rows must not be reordered,
# inserted or removed without updating the loads that consume them.
#
# In ChaCha20_ctr32_vx the address is taken with larl("%r7",".Lsigma")
# and then:
#   - vlm(@K[0],$beperm,"0(%r7)") loads sigma, the increments and the
#     byte-swap selector in one go (presumably five consecutive vector
#     registers @K[0],$t1,$t2,$t3,$beperm -- register assignment is made
#     earlier in the file, confirm there);
#   - vlm(@K[0],$t3,"0(%r7)") re-loads only sigma and the increments
#     after those registers were reused as scratch for input data.
ALIGN	(32);
Packit c4476c
LABEL	(".Lsigma");
Packit c4476c
# ChaCha constant: read as little-endian bytes the four words spell
# "expa" "nd 3" "2-by" "te k".
LONG	(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# endian-neutral sigma
Packit c4476c
# Block-counter increments: only the first 32-bit lane is non-zero, so
# a word-wise vector add (vaf) with one of these rows bumps just that
# lane of the counter row (used as K[3]+1, K[3]+2, K[3]+3 in the code).
LONG	(1,0,0,0);
Packit c4476c
LONG	(2,0,0,0);
Packit c4476c
LONG	(3,0,0,0);
Packit c4476c
# Selector for vperm (held in $beperm): byte indices 3,2,1,0 / 7,6,5,4 /
# ... reverse the byte order inside each 32-bit word of the source.
LONG	(0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);	# byte swap
Packit c4476c
Packit c4476c
# The remaining rows are not referenced by the code visible in this part
# of the file; presumably they serve the 4x "vertical" code path defined
# earlier (per-lane counter offsets 0..3, then each sigma word
# replicated across all four lanes) -- TODO confirm against that code.
LONG	(0,1,2,3);
Packit c4476c
LONG	(0x61707865,0x61707865,0x61707865,0x61707865);	# smashed sigma
Packit c4476c
LONG	(0x3320646e,0x3320646e,0x3320646e,0x3320646e);
Packit c4476c
LONG	(0x79622d32,0x79622d32,0x79622d32,0x79622d32);
Packit c4476c
LONG	(0x6b206574,0x6b206574,0x6b206574,0x6b206574);
Packit c4476c
Packit c4476c
# NUL-terminated identification string embedded in the generated object.
# The inner \" keep literal double quotes in the argument and \@ stops
# Perl from interpolating @openssl as an array.
ASCIZ	("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
Packit c4476c
# Restore 4-byte alignment after the variable-length string.
ALIGN	(4);
Packit c4476c
Packit c4476c
# Provided by perlasm::s390x (imported at the top of the file);
# presumably finalizes and emits the accumulated assembly -- see that
# module for the exact behavior.
PERLASM_END();