Blame crypto/sha/asm/sha256-c64xplus.pl

Packit c4476c
#! /usr/bin/env perl
Packit c4476c
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
Packit c4476c
#
Packit c4476c
# ====================================================================
Packit c4476c
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
Packit c4476c
# project. The module is, however, dual licensed under OpenSSL and
Packit c4476c
# CRYPTOGAMS licenses depending on where you obtain it. For further
Packit c4476c
# details see http://www.openssl.org/~appro/cryptogams/.
Packit c4476c
# ====================================================================
Packit c4476c
#
Packit c4476c
# SHA256 for C64x+.
Packit c4476c
#
Packit c4476c
# January 2012
Packit c4476c
#
Packit c4476c
# Performance is just below 10 cycles per processed byte, which is
Packit c4476c
# almost 40% faster than compiler-generated code. Unroll is unlikely
Packit c4476c
# to give more than ~8% improvement...
Packit c4476c
#
Packit c4476c
# !!! Note that this module uses AMR, which means that all interrupt
Packit c4476c
# service routines are expected to preserve it and, for their own well-being,
Packit c4476c
# zero it upon entry.
Packit c4476c
Packit c4476c
# Discard leading command-line arguments until one looks like a file name
# ("name.ext"); that argument becomes the output path.  $output is left
# false when no such argument exists.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
Packit c4476c
# Redirect STDOUT only when an output file was actually named, and abort
# with the OS error if it cannot be created.  Previously a failed (or
# nameless) open left STDOUT closed and the problem surfaced only at the
# final close().
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}
Packit c4476c
Packit c4476c
# Register allocation.  The first three names are the incoming arguments;
# $K256 is the pointer the generated code walks through the constant table.
($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
Packit c4476c
 $K256="A3";
Packit c4476c
Packit c4476c
# A-side registers A16..A31: working variables a..d, each followed by the
# register that holds its saved context copy, then A-side temporaries.
($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
Packit c4476c
	=map("A$_",(16..31));
Packit c4476c
# B-side registers B16..B31, laid out the same way for e..h.
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
Packit c4476c
	=map("B$_",(16..31));
Packit c4476c
Packit c4476c
($Xia,$Xib)=("A5","B5");			# circular/ring buffer
Packit c4476c
 $CTXB=$t2e;					# shares B29 with the $t2e temporary
Packit c4476c
Packit c4476c
($Xn,$X0,$K)=("B7","B8","B9");			# next input word, current X[i], current K256[i]
Packit c4476c
($Maj,$Ch)=($T2,"B6");				# $Maj shares a register with $T2
Packit c4476c
Packit c4476c
# Assembly template: everything up to the closing ___ is appended to $code,
# with the register-name variables assigned earlier interpolated by Perl.
$code.=<<___;
Packit c4476c
	.text
Packit c4476c
Packit c4476c
; Assemblers older than version 7000000 get __TI_EABI__ defined as 0 so
; that the conditionals testing it always have a value.
	.if	.ASSEMBLER_VERSION<7000000
Packit c4476c
	.asg	0,__TI_EABI__
Packit c4476c
	.endif
Packit c4476c
; Under EABI the underscore-prefixed name used below is substituted with
; the plain name.
	.if	__TI_EABI__
Packit c4476c
	.nocmp
Packit c4476c
	.asg	sha256_block_data_order,_sha256_block_data_order
Packit c4476c
	.endif
Packit c4476c
Packit c4476c
; Readable aliases for the return-address, frame-pointer and stack-pointer
; registers.
	.asg	B3,RA
Packit c4476c
	.asg	A15,FP
Packit c4476c
	.asg	B15,SP
Packit c4476c
Packit c4476c
; On big-endian targets SWAP2 and SWAP4 are replaced by MV, so the byte
; swaps applied to input words become plain moves.
	.if	.BIG_ENDIAN
Packit c4476c
	.asg	SWAP2,MV
Packit c4476c
	.asg	SWAP4,MV
Packit c4476c
	.endif
Packit c4476c
Packit c4476c
;-----------------------------------------------------------------------
; sha256_block_data_order: entry, stack set-up and context load
;   A4 = pointer to the eight 32-bit state words
;   B4 = input pointer (read with non-aligned word loads)
;   A6 = outer-loop count; the function returns at once when it is 0
; A 64-byte, 64-byte-aligned scratch area is carved from the stack and
; addressed circularly through AMR.
;-----------------------------------------------------------------------
	.global	_sha256_block_data_order
Packit c4476c
_sha256_block_data_order:
Packit c4476c
__sha256_block:
Packit c4476c
	.asmfunc stack_usage(64)
Packit c4476c
	MV	$NUM,A0				; reassign $NUM
Packit c4476c
||	MVK	-64,B0				; mask used for the 64-byte alignment below
Packit c4476c
  [!A0]	BNOP	RA				; if ($NUM==0) return;
Packit c4476c
|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
Packit c4476c
|| [A0]	MV	SP,FP				; FP keeps the entry SP for the epilogue
Packit c4476c
   [A0]	ADDKPC	__sha256_block,B2		; B2 = run-time address of __sha256_block
Packit c4476c
|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
Packit c4476c
; B1 is built up to 0x00050404, the value written to AMR below; the K256
; register first receives the offset of the table from __sha256_block,
; expressed differently for EABI and non-EABI builds.
	.if	__TI_EABI__
Packit c4476c
   [A0]	MVK	0x00404,B1
Packit c4476c
|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
Packit c4476c
   [A0]	MVKH	0x50000,B1
Packit c4476c
|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
Packit c4476c
	.else
Packit c4476c
   [A0]	MVK	0x00404,B1
Packit c4476c
|| [A0]	MVKL	(K256-__sha256_block),$K256
Packit c4476c
   [A0]	MVKH	0x50000,B1
Packit c4476c
|| [A0]	MVKH	(K256-__sha256_block),$K256
Packit c4476c
	.endif
Packit c4476c
   [A0]	MVC	B1,AMR				; setup circular addressing
Packit c4476c
|| [A0]	MV	SP,$Xia
Packit c4476c
   [A0]	MV	SP,$Xib
Packit c4476c
|| [A0]	ADD	B2,$K256,$K256			; base + offset = address of the K256 table
Packit c4476c
|| [A0]	MV	$CTXA,$CTXB			; B-side copy of the context pointer
Packit c4476c
|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
Packit c4476c
	LDW	*${CTXA}[0],$A			; load ctx
Packit c4476c
||	LDW	*${CTXB}[4],$E
Packit c4476c
	LDW	*${CTXA}[1],$B
Packit c4476c
||	LDW	*${CTXB}[5],$F
Packit c4476c
	LDW	*${CTXA}[2],$C
Packit c4476c
||	LDW	*${CTXB}[6],$G
Packit c4476c
	LDW	*${CTXA}[3],$D
Packit c4476c
||	LDW	*${CTXB}[7],$H
Packit c4476c
Packit c4476c
	LDNW	*$INP++,$Xn			; pre-fetch input
Packit c4476c
	LDW	*$K256++,$K			; pre-fetch K256[0]
Packit c4476c
	MVK	14,B0				; loop counters
Packit c4476c
	MVK	47,B1
Packit c4476c
||	ADDAW	$Xia,9,$Xia			; A-side ring pointer starts 9 words into the buffer
Packit c4476c
; One pass per input block.  A0 counts the passes still to run; a..h are
; copied into the *ctx registers so they can be added back after the
; rounds.
outerloop?:
Packit c4476c
	SUB	A0,1,A0
Packit c4476c
||	MV	$A,$Actx
Packit c4476c
||	MV	$E,$Ectx
Packit c4476c
||	MVD	$B,$Bctx
Packit c4476c
||	MVD	$F,$Fctx
Packit c4476c
	MV	$C,$Cctx
Packit c4476c
||	MV	$G,$Gctx
Packit c4476c
||	MVD	$D,$Dctx
Packit c4476c
||	MVD	$H,$Hctx
Packit c4476c
||	SWAP4	$Xn,$X0				; SWAP4 then SWAP2 = 32-bit byte reverse of the input word
Packit c4476c
Packit c4476c
; Software-pipelined loop with its count taken from B0 (14).  The body is
; 8 execute packets; each pass runs one round, stores the current X[i]
; into the ring buffer and byte-swaps the next input word.
	SPLOOPD	8				; BODY_00_14
Packit c4476c
||	MVC	B0,ILC
Packit c4476c
||	SWAP2	$X0,$X0
Packit c4476c
Packit c4476c
	LDNW	*$INP++,$Xn
Packit c4476c
||	ROTL	$A,30,$S0
Packit c4476c
||	OR	$A,$B,$Maj
Packit c4476c
||	AND	$A,$B,$t2a
Packit c4476c
||	ROTL	$E,26,$S1
Packit c4476c
||	AND	$F,$E,$Ch
Packit c4476c
||	ANDN	$G,$E,$t2e
Packit c4476c
	ROTL	$A,19,$t0a
Packit c4476c
||	AND	$C,$Maj,$Maj
Packit c4476c
||	ROTL	$E,21,$t0e
Packit c4476c
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
Packit c4476c
	ROTL	$A,10,$t1a
Packit c4476c
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
Packit c4476c
||	ROTL	$E,7,$t1e
Packit c4476c
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
Packit c4476c
	ADD	$X0,$T1,$T1			; T1 += X[i];
Packit c4476c
||	STW	$X0,*$Xib++
Packit c4476c
||	XOR	$t0a,$S0,$S0
Packit c4476c
||	XOR	$t0e,$S1,$S1
Packit c4476c
	XOR	$t1a,$S0,$S0			; Sigma0(a)
Packit c4476c
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
Packit c4476c
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
Packit c4476c
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
Packit c4476c
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
Packit c4476c
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
Packit c4476c
||	ROTL	$G,0,$H				; h = g
Packit c4476c
||	MV	$F,$G				; g = f
Packit c4476c
||	MV	$X0,$X14			; keep a register copy of the current X[i]
Packit c4476c
||	SWAP4	$Xn,$X0
Packit c4476c
	SWAP2	$X0,$X0
Packit c4476c
||	MV	$E,$F				; f = e
Packit c4476c
||	ADD	$D,$T1,$E			; e = d + T1
Packit c4476c
||	MV	$C,$D				; d = c
Packit c4476c
	MV	$B,$C				; c = b
Packit c4476c
||	MV	$A,$B				; b = a
Packit c4476c
||	ADD	$T1,$T2,$A			; a = T1 + T2
Packit c4476c
	SPKERNEL
Packit c4476c
Packit c4476c
; Round 15, written out separately from the loops.  No further input is
; fetched here; the lines tagged modulo-scheduled instead start the
; message-schedule work for the next round (ring-buffer reloads and the
; sigma rotates/shifts).
	ROTL	$A,30,$S0			; BODY_15
Packit c4476c
||	OR	$A,$B,$Maj
Packit c4476c
||	AND	$A,$B,$t2a
Packit c4476c
||	ROTL	$E,26,$S1
Packit c4476c
||	AND	$F,$E,$Ch
Packit c4476c
||	ANDN	$G,$E,$t2e
Packit c4476c
||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
Packit c4476c
	ROTL	$A,19,$t0a
Packit c4476c
||	AND	$C,$Maj,$Maj
Packit c4476c
||	ROTL	$E,21,$t0e
Packit c4476c
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
Packit c4476c
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
Packit c4476c
	ROTL	$A,10,$t1a
Packit c4476c
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
Packit c4476c
||	ROTL	$E,7,$t1e
Packit c4476c
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
Packit c4476c
	ADD	$X0,$T1,$T1			; T1 += X[i];
Packit c4476c
||	STW	$X0,*$Xib++
Packit c4476c
||	XOR	$t0a,$S0,$S0
Packit c4476c
||	XOR	$t0e,$S1,$S1
Packit c4476c
	XOR	$t1a,$S0,$S0			; Sigma0(a)
Packit c4476c
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
Packit c4476c
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
Packit c4476c
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
Packit c4476c
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
Packit c4476c
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
Packit c4476c
||	ROTL	$G,0,$H				; h = g
Packit c4476c
||	MV	$F,$G				; g = f
Packit c4476c
||	MV	$X0,$X15			; keep a register copy of the current X[i]
Packit c4476c
	MV	$E,$F				; f = e
Packit c4476c
||	ADD	$D,$T1,$E			; e = d + T1
Packit c4476c
||	MV	$C,$D				; d = c
Packit c4476c
||	MV	$Xn,$X0				; modulo-scheduled
Packit c4476c
||	LDW	*$Xia,$X9			; modulo-scheduled
Packit c4476c
||	ROTL	$X1,25,$t0e			; modulo-scheduled
Packit c4476c
||	ROTL	$X14,15,$t0a			; modulo-scheduled
Packit c4476c
	SHRU	$X1,3,$s0			; modulo-scheduled
Packit c4476c
||	SHRU	$X14,10,$s1			; modulo-scheduled
Packit c4476c
||	ROTL	$B,0,$C				; c = b
Packit c4476c
||	MV	$A,$B				; b = a
Packit c4476c
||	ADD	$T1,$T2,$A			; a = T1 + T2
Packit c4476c
Packit c4476c
; Software-pipelined loop with its count taken from B1 (47); the body is
; 10 execute packets.  Each pass forms
;   X[i] += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14])
; from the ring buffer, runs one round with it and stores it back; the
; lines tagged modulo-scheduled prepare the operands of the following
; pass.
	SPLOOPD	10				; BODY_16_63
Packit c4476c
||	MVC	B1,ILC
Packit c4476c
||	ROTL	$X1,14,$t1e			; modulo-scheduled
Packit c4476c
||	ROTL	$X14,13,$t1a			; modulo-scheduled
Packit c4476c
Packit c4476c
	XOR	$t0e,$s0,$s0
Packit c4476c
||	XOR	$t0a,$s1,$s1
Packit c4476c
||	MV	$X15,$X14
Packit c4476c
||	MV	$X1,$Xn
Packit c4476c
	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
Packit c4476c
||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
Packit c4476c
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
Packit c4476c
	ROTL	$A,30,$S0
Packit c4476c
||	OR	$A,$B,$Maj
Packit c4476c
||	AND	$A,$B,$t2a
Packit c4476c
||	ROTL	$E,26,$S1
Packit c4476c
||	AND	$F,$E,$Ch
Packit c4476c
||	ANDN	$G,$E,$t2e
Packit c4476c
||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
Packit c4476c
	ROTL	$A,19,$t0a
Packit c4476c
||	AND	$C,$Maj,$Maj
Packit c4476c
||	ROTL	$E,21,$t0e
Packit c4476c
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
Packit c4476c
||	ADD	$s0,$X0,$X0			; X[i] += sigma0(X[i+1])
Packit c4476c
	ROTL	$A,10,$t1a
Packit c4476c
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
Packit c4476c
||	ROTL	$E,7,$t1e
Packit c4476c
||	ADD	$H,$K,$T1			; T1 = h + K256[i]
Packit c4476c
||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
Packit c4476c
	XOR	$t0a,$S0,$S0
Packit c4476c
||	XOR	$t0e,$S1,$S1
Packit c4476c
||	ADD	$X0,$T1,$T1			; T1 += X[i]
Packit c4476c
||	STW	$X0,*$Xib++
Packit c4476c
	XOR	$t1a,$S0,$S0			; Sigma0(a)
Packit c4476c
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
Packit c4476c
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
Packit c4476c
||	MV	$X0,$X15
Packit c4476c
||	ROTL	$G,0,$H				; h = g
Packit c4476c
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
Packit c4476c
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
Packit c4476c
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
Packit c4476c
||	MV	$F,$G				; g = f
Packit c4476c
||	MV	$Xn,$X0				; modulo-scheduled
Packit c4476c
||	LDW	*++$Xia,$X9			; modulo-scheduled
Packit c4476c
||	ROTL	$X1,25,$t0e			; modulo-scheduled
Packit c4476c
||	ROTL	$X14,15,$t0a			; modulo-scheduled
Packit c4476c
	ROTL	$X1,14,$t1e			; modulo-scheduled
Packit c4476c
||	ROTL	$X14,13,$t1a			; modulo-scheduled
Packit c4476c
||	MV	$E,$F				; f = e
Packit c4476c
||	ADD	$D,$T1,$E			; e = d + T1
Packit c4476c
||	MV	$C,$D				; d = c
Packit c4476c
||	MV	$B,$C				; c = b
Packit c4476c
	MV	$A,$B				; b = a
Packit c4476c
||	ADD	$T1,$T2,$A			; a = T1 + T2
Packit c4476c
||	SHRU	$X1,3,$s0			; modulo-scheduled
Packit c4476c
||	SHRU	$X14,10,$s1			; modulo-scheduled
Packit c4476c
	SPKERNEL
Packit c4476c
Packit c4476c
; End of a block: the saved context copies are added back into a..h.
; With A0 non-zero, branch back for the next block, prefetching its first
; input word and rewinding the table pointer by 260 bytes (65 words).
; With A0 zero, return: restore SP and FP, store a..h to the context and
; clear AMR.
; NOTE(review): the last four stores are unpredicated; when A0 is non-zero
; they appear to rely on the branch to outerloop having taken effect
; before they are reached -- confirm against branch delay-slot timing.
   [A0]	B	outerloop?
Packit c4476c
|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
Packit c4476c
|| [A0]	ADDK	-260,$K256			; rewind K256
Packit c4476c
||	ADD	$Actx,$A,$A			; accumulate ctx
Packit c4476c
||	ADD	$Ectx,$E,$E
Packit c4476c
||	ADD	$Bctx,$B,$B
Packit c4476c
	ADD	$Fctx,$F,$F
Packit c4476c
||	ADD	$Cctx,$C,$C
Packit c4476c
||	ADD	$Gctx,$G,$G
Packit c4476c
||	ADD	$Dctx,$D,$D
Packit c4476c
||	ADD	$Hctx,$H,$H
Packit c4476c
|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]
Packit c4476c
Packit c4476c
  [!A0]	BNOP	RA
Packit c4476c
||[!A0]	MV	$CTXA,$CTXB			; re-create the B-side context pointer (its register doubled as a temporary)
Packit c4476c
  [!A0]	MV	FP,SP				; restore stack pointer
Packit c4476c
||[!A0]	LDW	*FP[0],FP			; restore frame pointer
Packit c4476c
  [!A0]	STW	$A,*${CTXA}[0]  		; save ctx
Packit c4476c
||[!A0]	STW	$E,*${CTXB}[4]
Packit c4476c
||[!A0]	MVK	0,B0
Packit c4476c
  [!A0]	STW	$B,*${CTXA}[1]
Packit c4476c
||[!A0]	STW	$F,*${CTXB}[5]
Packit c4476c
||[!A0]	MVC	B0,AMR				; clear AMR
Packit c4476c
	STW	$C,*${CTXA}[2]
Packit c4476c
||	STW	$G,*${CTXB}[6]
Packit c4476c
	STW	$D,*${CTXA}[3]
Packit c4476c
||	STW	$H,*${CTXB}[7]
Packit c4476c
	.endasmfunc
Packit c4476c
Packit c4476c
; Constant table.  It is placed in a differently named section for EABI
; and non-EABI builds and aligned to 128 bytes.
	.if	__TI_EABI__
Packit c4476c
	.sect	".text:sha_asm.const"
Packit c4476c
	.else
Packit c4476c
	.sect	".const:sha_asm"
Packit c4476c
	.endif
Packit c4476c
	.align	128
Packit c4476c
; 64 round constants, one 32-bit word each.  The code reads them in order
; through a post-incremented pointer and rewinds that pointer per block.
K256:
Packit c4476c
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
Packit c4476c
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
Packit c4476c
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
Packit c4476c
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
Packit c4476c
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
Packit c4476c
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
Packit c4476c
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
Packit c4476c
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
Packit c4476c
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
Packit c4476c
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
Packit c4476c
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
Packit c4476c
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
Packit c4476c
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
Packit c4476c
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
Packit c4476c
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
Packit c4476c
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
Packit c4476c
; Identification string stored after the table.
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
Packit c4476c
	.align	4
Packit c4476c
Packit c4476c
___
Packit c4476c
Packit c4476c
# Write the accumulated assembly text to STDOUT, then close the handle and
# abort if the close reports an error.
print STDOUT $code;
Packit c4476c
close(STDOUT) || die "error closing STDOUT: $!";