#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*)	Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
#	in-order, i.e. load instruction has to complete prior next
#	instruction in given thread is executed, even if the latter is
#	not dependent on load result! This means that on T1 two 32-bit
#	loads are always slower than one 64-bit load. Once again this
#	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
#	2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.

# The last command-line argument names the output file. STDOUT is
# redirected to it, so everything printed at the end of this script lands
# in that file. Fail immediately if the file cannot be opened.
$output=pop;
open STDOUT,">",$output or die "can't open $output: $!";

# Select the SHA-512 or SHA-256 flavour from the output file name and set
# the word size ($SZ), the load/store/shift mnemonics, the rotation amounts,
# the round count and the register allocation accordingly.
# $lastK is compared against the low 12 bits of the most recently loaded
# K[] word to decide when the round loop is finished.
if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
# Scratch registers shared by both flavours.
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

# Incoming arguments (after "save") and two more temporaries that hold
# the input misalignment shift counts.
$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
# $Xload->($i,$a..$h): append code that leaves X[$i]+$h in $T1.
# For $i==0 it first loads the whole 64-byte block into @X as eight 64-bit
# words and, unless the input is aligned ($tmp31 is zero), fetches a ninth
# word and shifts/merges neighbouring words into place.
# Two 32-bit message words share one @X register: even $i uses the upper
# half (srlx ...,32), odd $i the lower half.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd	@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
    }
} if ($SZ==4);

########### SHA512
# $Xload->($i,$a..$h): append code that assembles 64-bit message word X[$i]
# from 32-bit halves staged in %l0-%l7 (shifted by $tmp31/$tmp32 to cope
# with misaligned input), stores it to the X[] area on the stack and
# leaves X[$i]+$h in $T1.
# @pair names the two %l registers holding X[$i] plus the first register
# of the following word. For $i==15 the %l registers are additionally
# reloaded from the stack with the words the first Xupdate round consumes.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h): append the code for one round.
# For $i<16 the message word is fetched by $Xload, which leaves X[$i]+$h
# in $T1. For later rounds the caller has already left the new X[] value
# in $T1 and only $h is added here. The round then accumulates Ch(e,f,g),
# Sigma1(e) and K[$i] into $T1, computes Sigma0(a)+Maj(a,b,c) in $h and
# finishes with d+=T1, h+=T1. K[$i] is left in $tmp2.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd	$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}

########### SHA256
# $BODY_16_XX->($i,$a..$h): append the message-schedule update for round
# $i followed by the common round body. The new word is
# sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] + X[i]; it is left in $T1
# and written back into the half of @X[($i/2)%8] that belongs to X[i].
# Two 32-bit words share each @X register, so the parity of $i decides
# whether an operand is taken from the upper half (via srlx ...,32 into a
# temporary) or used in place.
$BODY_16_XX = sub {
my $i=$_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
# $BODY_16_XX->($i,$a..$h): append the message-schedule update for round
# $i followed by the common round body. The operands arrive as 32-bit
# halves in %l0-%l7 (X[i] in %l0/%l1, X[i+1] in %l2/%l3, X[i+9] in
# %l4/%l5, X[i+14] in %l6/%l7), are glued into 64-bit values, and the
# %l registers are reloaded from the stack for the next round while the
# arithmetic proceeds. The new word is stored to X[$i%16] on the stack
# and left in $T1.
$BODY_16_XX = sub {
my $i=$_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
    &BODY_00_15(@_);
} if ($SZ==8);

# Start of the generated file: common header, scratch-register
# declarations for 64-bit builds, and the label that opens the K constant
# table (placed in the text section).
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
# Round constants: 64 32-bit words for SHA-256, or 80 64-bit words for
# SHA-512 emitted as pairs of 32-bit halves (high half first).
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
# Close the K table and open the exported function. The first thing the
# function does is read OPENSSL_sparcv9cap_P[1] and test the
# CFR_SHA${label} bit: when it is clear, control goes to .Lsoftware,
# otherwise it falls through into the hardware path emitted next.
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
# Hardware SHA512 path (leaf code, no register window). The context is
# kept in %f0-%f14 and each 128-byte block is loaded into %f16-%f46 before
# the SHA512 opcode, which is emitted as a raw .word. Input that is not
# 8-byte aligned goes through .Lhwunaligned, where alignaddr/faligndata
# realign the data and the trailing doubleword is carried over to the next
# iteration in the branch delay slot.
$code.=<<___ if ($SZ==8); 		# SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	 ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	 std	%f14, [%o0 + 0x38]

.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x80], %f50
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
	nop
___
# Hardware SHA256 path (leaf code, no register window). The context is
# kept in %f0-%f7 and each 64-byte block is loaded into %f8-%f22 before
# the SHA256 opcode, which is emitted as a raw .word. Input that is not
# 8-byte aligned goes through .Lhwunaligned, where alignaddr/faligndata
# realign the data and the trailing doubleword is carried over to the next
# iteration in the branch delay slot.
$code.=<<___ if ($SZ==4); 		# SHA256
	ld	[%o0 + 0x00], %f0
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	ld	[%o0 + 0x0c], %f3
	ld	[%o0 + 0x10], %f4
	ld	[%o0 + 0x14], %f5
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x18], %f6
	bne,pn	%icc, .Lhwunaligned
	 ld	[%o0 + 0x1c], %f7

.Lhwloop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	st	%f4, [%o0 + 0x10]
	st	%f5, [%o0 + 0x14]
	st	%f6, [%o0 + 0x18]
	retl
	 st	%f7, [%o0 + 0x1c]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop
___
# Software path prologue: open a register window (plus $locals bytes for
# X[] when it lives on the stack), split $inp into an aligned pointer and
# a misalignment bit count in $tmp31, and turn $len from a block count
# into the end-of-input pointer.
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME-$locals,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
# SHA512 only: $tmp32 = 32 - $tmp31, the complementary shift count.
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
# Locate the K table relative to the current pc and load the eight
# working variables from the context.
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
# Rounds 0..15 are emitted once; rounds 16..31 form the body of the
# .L16_xx loop. @V is rotated after every round so that each round sees
# the working variables under their new names.
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# $tmp2 still holds the last K[] word loaded by the round body; the
# .L16_xx loop repeats until its low 12 bits equal $lastK. $Ktbl is
# advanced by 16 entries in the branch delay slot.
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
# SHA256: reload the previous context words into @X (free at this point),
# add them to the working variables and store the sums back to the context.
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
# SHA512: the previous context is read as 32-bit halves into %l0-%l7
# (four 64-bit words at a time), each pair is glued into a 64-bit value,
# added to the matching working variable and the sum is stored back.
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
# Per-block loop tail and function epilogue: advance $inp by one block,
# loop until it reaches $len (rewinding $Ktbl in the delay slot), then
# return and emit the symbol type/size and the identification string.
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	SIZE_T_CC,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
#
# unvis($mnemonic,$rs1,$rs2,$rd)
#   Returns ".word\t0x<encoding> !<original instruction>" when $mnemonic
#   is one of the known VIS operations (faligndata, for) and all three
#   operands are %fN registers that can be encoded. In every other case
#   the instruction text "$mnemonic\t$rs1,$rs2,$rd" is returned unchanged.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	# foreach aliases $_ to each operand, so assigning to $_ turns
	# the register name into its 5-bit field value in place.
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		# odd-numbered registers above %f31 do not exist
		return $ref if ($1&1);
		# re-encode for upper double register addressing:
		# bit 5 of the register number moves to bit 0
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
# unalignaddr($mnemonic,$rs1,$rs2,$rd)
#   Returns ".word\t0x<encoding> !<original instruction>" when all three
#   operands are integer registers of the form %gN, %oN, %lN or %iN
#   (N in 0..7); otherwise returns the instruction text unchanged.
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# first register number of each integer register group
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    # foreach aliases $_ to each operand, so the register name is replaced
    # by its 5-bit register number in place.
    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}

# Post-process the accumulated text one line at a time:
#  1. evaluate every `...` expression and substitute its value;
#  2. hand three-operand f* instructions on %f registers to unvis();
#  3. hand alignaddr on integer registers to unalignaddr();
# then print the line to the output file.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

# Buffered write errors only surface at close, hence the check.
close STDOUT or die "error closing STDOUT: $!";