Blame crypto/bn/asm/alpha-mont.pl

Packit c4476c
#! /usr/bin/env perl
Packit c4476c
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
Packit c4476c
#
Packit c4476c
# ====================================================================
Packit c4476c
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
Packit c4476c
# project. The module is, however, dual licensed under OpenSSL and
Packit c4476c
# CRYPTOGAMS licenses depending on where you obtain it. For further
Packit c4476c
# details see http://www.openssl.org/~appro/cryptogams/.
Packit c4476c
# ====================================================================
Packit c4476c
#
Packit c4476c
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
Packit c4476c
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
Packit c4476c
# instructed to '-tune host' code with in-line assembler. Other
Packit c4476c
# benchmarks improve by 15-20%. To anchor it to something else, the
Packit c4476c
# code provides approximately the same performance per GHz as AMD64.
Packit c4476c
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
Packit c4476c
# difference.
Packit c4476c
Packit c4476c
# The last command-line argument names the file that receives the
# generated assembly; STDOUT is redirected to it so that the final
# "print" lands there.
$output=pop;
Packit c4476c
# Stop right away, naming the file, if the redirect cannot be set up.
# Without the check a failed open only shows up when STDOUT is closed
# at the very end of the script, with no hint of which path was at fault.
open STDOUT,">$output" or die "can't open $output: $!";
Packit c4476c
Packit c4476c
# Registers that carry the arguments of
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# in prototype order.
Packit c4476c
$rp = 'a0';	# 1st argument: BN_ULONG *rp
Packit c4476c
$ap = 'a1';	# 2nd argument: const BN_ULONG *ap
Packit c4476c
$bp = 'a2';	# 3rd argument: const BN_ULONG *bp
Packit c4476c
$np = 'a3';	# 4th argument: const BN_ULONG *np
Packit c4476c
$n0 = 'a4';	# 5th argument: const BN_ULONG *n0 (the generated code loads the word it points to)
Packit c4476c
$num = 'a5';	# 6th argument: int num (the generated code sign-extends it from 32 bits)
Packit c4476c
Packit c4476c
# Working registers used by the generated code.  The role notes describe
# how the assembly template further down in this script uses each one.
$lo0 = 't0';	# low word of the ap[]*bi accumulation
Packit c4476c
$hi0 = 't1';	# carry word of the ap[]*bi accumulation; borrow in the final subtraction
Packit c4476c
$lo1 = 't2';	# low word of the np[]*m1 accumulation
Packit c4476c
$hi1 = 't3';	# carry word of the np[]*m1 accumulation; top overflow bit at the end
Packit c4476c
$aj = 't4';	# address of ap[j], then the word loaded from it
Packit c4476c
$bi = 't5';	# address of bp[i], then the word loaded from it
Packit c4476c
$nj = 't6';	# address of np[j], then the word loaded from it
Packit c4476c
$tp = 't7';	# cursor into the temporary vector kept on the stack
Packit c4476c
$alo = 't8';	# mulq  result of aj*bi
Packit c4476c
$ahi = 't9';	# umulh result of aj*bi
Packit c4476c
$nlo = 't10';	# mulq  result of nj*m1
Packit c4476c
$nhi = 't11';	# umulh result of nj*m1
Packit c4476c
$tj = 't12';	# word loaded from the temporary vector; also borrowed for loop tests and the end pointer
Packit c4476c
$i = 's3';	# outer loop counter (s3 is saved and restored by the generated code)
Packit c4476c
$j = 's4';	# inner loop counter (s4 is saved and restored by the generated code)
Packit c4476c
$m1 = 's5';	# per-pass multiplier, lo0*n0 (s5 is saved and restored by the generated code)
Packit c4476c
Packit c4476c
# ---------------------------------------------------------------------
# Assembly text for bn_mul_mont.  The heredoc below is interpolated, so
# every $name placeholder expands to the Alpha register name assigned to
# that variable earlier in this script (which is also why the '@' in the
# trailing .ascii string is backslash-escaped); everything else is
# emitted verbatim.  The text contains C preprocessor directives
# (#ifdef/#include), so the output is presumably meant to be run through
# cpp before it is assembled -- confirm against the build rules.
#
# Outline of the generated routine:
#
#   prologue  Allocates a 48-byte frame, saves ra, s3, s4, s5 and fp in
#             it and points fp at the frame base so the epilogue can
#             discard the variable-size area below it.
#   guard     num is sign-extended from 32 bits.  If num < 4 the routine
#             jumps to .Lexit and returns 0 in v0 without writing rp.
#   scratch   sp is lowered by 8*num+16 bytes and then rounded down to a
#             4096-byte boundary; the temporary vector tp[] starts at sp.
#   .L1st     First pass with bi = bp[0] and m1 = low64(ap[0]*bi) * n0,
#             where n0 is the word loaded through the n0 pointer.  For
#             each j the 128-bit products ap[j]*bi and np[j]*m1 are
#             added together with the running carries and the low word
#             for index j is stored at tp[j-1].  The tail stores
#             tp[num-2], tp[num-1] and the final carry bit in tp[num].
#   .Louter   For i = 1 .. num-1: bi = bp[i] and
#             m1 = low64(ap[0]*bi + tp[0]) * n0.  .Linner then adds
#             ap[j]*bi + np[j]*m1 + tp[j] and stores the low word at
#             tp[j-1].  The tail folds in the previous tp[num-1] and
#             tp[num] and rewrites tp[num-2..num]; it borrows the j
#             register as a temporary.  The top carry also stays in hi1.
#   .Lsub     rp[k] = tp[k] - np[k] - borrow for k = 0 .. num-1, with
#             the borrow kept in hi0.  bp is reused to remember the
#             original rp while rp is advanced.
#             NOTE(review): ap is also pointed at sp just before this
#             loop but is not read again anywhere in the routine.
#   .Lcopy    hi0 = hi1 - borrow.  When that is zero each rp[k] keeps
#             the difference stored by .Lsub; otherwise tp[k] is written
#             over it.  Each tp[k] read here is overwritten with zero.
#             v0 is then set to 1.
#   .Lexit    Shared epilogue: sp is reset from fp, s3/s4/s5/fp are
#             reloaded and the 48-byte frame is released.  No
#             instruction in the routine writes ra, which is why its
#             reload is left commented out.
# ---------------------------------------------------------------------
$code=<<___;
Packit c4476c
#ifdef __linux__
Packit c4476c
#include <asm/regdef.h>
Packit c4476c
#else
Packit c4476c
#include <asm.h>
Packit c4476c
#include <regdef.h>
Packit c4476c
#endif
Packit c4476c
Packit c4476c
.text
Packit c4476c
Packit c4476c
.set	noat
Packit c4476c
.set	noreorder
Packit c4476c
Packit c4476c
.globl	bn_mul_mont
Packit c4476c
.align	5
Packit c4476c
.ent	bn_mul_mont
Packit c4476c
bn_mul_mont:
Packit c4476c
	lda	sp,-48(sp)
Packit c4476c
	stq	ra,0(sp)
Packit c4476c
	stq	s3,8(sp)
Packit c4476c
	stq	s4,16(sp)
Packit c4476c
	stq	s5,24(sp)
Packit c4476c
	stq	fp,32(sp)
Packit c4476c
	mov	sp,fp
Packit c4476c
	.mask	0x0400f000,-48
Packit c4476c
	.frame	fp,48,ra
Packit c4476c
	.prologue 0
Packit c4476c
Packit c4476c
	.align	4
Packit c4476c
	.set	reorder
Packit c4476c
	sextl	$num,$num
Packit c4476c
	mov	0,v0
Packit c4476c
	cmplt	$num,4,AT
Packit c4476c
	bne	AT,.Lexit
Packit c4476c
Packit c4476c
	ldq	$hi0,0($ap)	# ap[0]
Packit c4476c
	s8addq	$num,16,AT
Packit c4476c
	ldq	$aj,8($ap)
Packit c4476c
	subq	sp,AT,sp
Packit c4476c
	ldq	$bi,0($bp)	# bp[0]
Packit c4476c
	lda	AT,-4096(zero)	# mov	-4096,AT
Packit c4476c
	ldq	$n0,0($n0)
Packit c4476c
	and	sp,AT,sp
Packit c4476c
Packit c4476c
	mulq	$hi0,$bi,$lo0
Packit c4476c
	ldq	$hi1,0($np)	# np[0]
Packit c4476c
	umulh	$hi0,$bi,$hi0
Packit c4476c
	ldq	$nj,8($np)
Packit c4476c
Packit c4476c
	mulq	$lo0,$n0,$m1
Packit c4476c
Packit c4476c
	mulq	$hi1,$m1,$lo1
Packit c4476c
	umulh	$hi1,$m1,$hi1
Packit c4476c
Packit c4476c
	addq	$lo1,$lo0,$lo1
Packit c4476c
	cmpult	$lo1,$lo0,AT
Packit c4476c
	addq	$hi1,AT,$hi1
Packit c4476c
Packit c4476c
	mulq	$aj,$bi,$alo
Packit c4476c
	mov	2,$j
Packit c4476c
	umulh	$aj,$bi,$ahi
Packit c4476c
	mov	sp,$tp
Packit c4476c
Packit c4476c
	mulq	$nj,$m1,$nlo
Packit c4476c
	s8addq	$j,$ap,$aj
Packit c4476c
	umulh	$nj,$m1,$nhi
Packit c4476c
	s8addq	$j,$np,$nj
Packit c4476c
.align	4
Packit c4476c
.L1st:
Packit c4476c
	.set	noreorder
Packit c4476c
	ldq	$aj,0($aj)
Packit c4476c
	addl	$j,1,$j
Packit c4476c
	ldq	$nj,0($nj)
Packit c4476c
	lda	$tp,8($tp)
Packit c4476c
Packit c4476c
	addq	$alo,$hi0,$lo0
Packit c4476c
	mulq	$aj,$bi,$alo
Packit c4476c
	cmpult	$lo0,$hi0,AT
Packit c4476c
	addq	$nlo,$hi1,$lo1
Packit c4476c
Packit c4476c
	mulq	$nj,$m1,$nlo
Packit c4476c
	addq	$ahi,AT,$hi0
Packit c4476c
	cmpult	$lo1,$hi1,v0
Packit c4476c
	cmplt	$j,$num,$tj
Packit c4476c
Packit c4476c
	umulh	$aj,$bi,$ahi
Packit c4476c
	addq	$nhi,v0,$hi1
Packit c4476c
	addq	$lo1,$lo0,$lo1
Packit c4476c
	s8addq	$j,$ap,$aj
Packit c4476c
Packit c4476c
	umulh	$nj,$m1,$nhi
Packit c4476c
	cmpult	$lo1,$lo0,v0
Packit c4476c
	addq	$hi1,v0,$hi1
Packit c4476c
	s8addq	$j,$np,$nj
Packit c4476c
Packit c4476c
	stq	$lo1,-8($tp)
Packit c4476c
	nop
Packit c4476c
	unop
Packit c4476c
	bne	$tj,.L1st
Packit c4476c
	.set	reorder
Packit c4476c
Packit c4476c
	addq	$alo,$hi0,$lo0
Packit c4476c
	addq	$nlo,$hi1,$lo1
Packit c4476c
	cmpult	$lo0,$hi0,AT
Packit c4476c
	cmpult	$lo1,$hi1,v0
Packit c4476c
	addq	$ahi,AT,$hi0
Packit c4476c
	addq	$nhi,v0,$hi1
Packit c4476c
Packit c4476c
	addq	$lo1,$lo0,$lo1
Packit c4476c
	cmpult	$lo1,$lo0,v0
Packit c4476c
	addq	$hi1,v0,$hi1
Packit c4476c
Packit c4476c
	stq	$lo1,0($tp)
Packit c4476c
Packit c4476c
	addq	$hi1,$hi0,$hi1
Packit c4476c
	cmpult	$hi1,$hi0,AT
Packit c4476c
	stq	$hi1,8($tp)
Packit c4476c
	stq	AT,16($tp)
Packit c4476c
Packit c4476c
	mov	1,$i
Packit c4476c
.align	4
Packit c4476c
.Louter:
Packit c4476c
	s8addq	$i,$bp,$bi
Packit c4476c
	ldq	$hi0,0($ap)
Packit c4476c
	ldq	$aj,8($ap)
Packit c4476c
	ldq	$bi,0($bi)
Packit c4476c
	ldq	$hi1,0($np)
Packit c4476c
	ldq	$nj,8($np)
Packit c4476c
	ldq	$tj,0(sp)
Packit c4476c
Packit c4476c
	mulq	$hi0,$bi,$lo0
Packit c4476c
	umulh	$hi0,$bi,$hi0
Packit c4476c
Packit c4476c
	addq	$lo0,$tj,$lo0
Packit c4476c
	cmpult	$lo0,$tj,AT
Packit c4476c
	addq	$hi0,AT,$hi0
Packit c4476c
Packit c4476c
	mulq	$lo0,$n0,$m1
Packit c4476c
Packit c4476c
	mulq	$hi1,$m1,$lo1
Packit c4476c
	umulh	$hi1,$m1,$hi1
Packit c4476c
Packit c4476c
	addq	$lo1,$lo0,$lo1
Packit c4476c
	cmpult	$lo1,$lo0,AT
Packit c4476c
	mov	2,$j
Packit c4476c
	addq	$hi1,AT,$hi1
Packit c4476c
Packit c4476c
	mulq	$aj,$bi,$alo
Packit c4476c
	mov	sp,$tp
Packit c4476c
	umulh	$aj,$bi,$ahi
Packit c4476c
Packit c4476c
	mulq	$nj,$m1,$nlo
Packit c4476c
	s8addq	$j,$ap,$aj
Packit c4476c
	umulh	$nj,$m1,$nhi
Packit c4476c
.align	4
Packit c4476c
.Linner:
Packit c4476c
	.set	noreorder
Packit c4476c
	ldq	$tj,8($tp)	#L0
Packit c4476c
	nop			#U1
Packit c4476c
	ldq	$aj,0($aj)	#L1
Packit c4476c
	s8addq	$j,$np,$nj	#U0
Packit c4476c
Packit c4476c
	ldq	$nj,0($nj)	#L0
Packit c4476c
	nop			#U1
Packit c4476c
	addq	$alo,$hi0,$lo0	#L1
Packit c4476c
	lda	$tp,8($tp)
Packit c4476c
Packit c4476c
	mulq	$aj,$bi,$alo	#U1
Packit c4476c
	cmpult	$lo0,$hi0,AT	#L0
Packit c4476c
	addq	$nlo,$hi1,$lo1	#L1
Packit c4476c
	addl	$j,1,$j
Packit c4476c
Packit c4476c
	mulq	$nj,$m1,$nlo	#U1
Packit c4476c
	addq	$ahi,AT,$hi0	#L0
Packit c4476c
	addq	$lo0,$tj,$lo0	#L1
Packit c4476c
	cmpult	$lo1,$hi1,v0	#U0
Packit c4476c
Packit c4476c
	umulh	$aj,$bi,$ahi	#U1
Packit c4476c
	cmpult	$lo0,$tj,AT	#L0
Packit c4476c
	addq	$lo1,$lo0,$lo1	#L1
Packit c4476c
	addq	$nhi,v0,$hi1	#U0
Packit c4476c
Packit c4476c
	umulh	$nj,$m1,$nhi	#U1
Packit c4476c
	s8addq	$j,$ap,$aj	#L0
Packit c4476c
	cmpult	$lo1,$lo0,v0	#L1
Packit c4476c
	cmplt	$j,$num,$tj	#U0	# borrow $tj
Packit c4476c
Packit c4476c
	addq	$hi0,AT,$hi0	#L0
Packit c4476c
	addq	$hi1,v0,$hi1	#U1
Packit c4476c
	stq	$lo1,-8($tp)	#L1
Packit c4476c
	bne	$tj,.Linner	#U0
Packit c4476c
	.set	reorder
Packit c4476c
Packit c4476c
	ldq	$tj,8($tp)
Packit c4476c
	addq	$alo,$hi0,$lo0
Packit c4476c
	addq	$nlo,$hi1,$lo1
Packit c4476c
	cmpult	$lo0,$hi0,AT
Packit c4476c
	cmpult	$lo1,$hi1,v0
Packit c4476c
	addq	$ahi,AT,$hi0
Packit c4476c
	addq	$nhi,v0,$hi1
Packit c4476c
Packit c4476c
	addq	$lo0,$tj,$lo0
Packit c4476c
	cmpult	$lo0,$tj,AT
Packit c4476c
	addq	$hi0,AT,$hi0
Packit c4476c
Packit c4476c
	ldq	$tj,16($tp)
Packit c4476c
	addq	$lo1,$lo0,$j
Packit c4476c
	cmpult	$j,$lo0,v0
Packit c4476c
	addq	$hi1,v0,$hi1
Packit c4476c
Packit c4476c
	addq	$hi1,$hi0,$lo1
Packit c4476c
	stq	$j,0($tp)
Packit c4476c
	cmpult	$lo1,$hi0,$hi1
Packit c4476c
	addq	$lo1,$tj,$lo1
Packit c4476c
	cmpult	$lo1,$tj,AT
Packit c4476c
	addl	$i,1,$i
Packit c4476c
	addq	$hi1,AT,$hi1
Packit c4476c
	stq	$lo1,8($tp)
Packit c4476c
	cmplt	$i,$num,$tj	# borrow $tj
Packit c4476c
	stq	$hi1,16($tp)
Packit c4476c
	bne	$tj,.Louter
Packit c4476c

Packit c4476c
	s8addq	$num,sp,$tj	# &tp[num]
Packit c4476c
	mov	$rp,$bp		# put rp aside
Packit c4476c
	mov	sp,$tp
Packit c4476c
	mov	sp,$ap
Packit c4476c
	mov	0,$hi0		# clear borrow bit
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Lsub:	ldq	$lo0,0($tp)
Packit c4476c
	ldq	$lo1,0($np)
Packit c4476c
	lda	$tp,8($tp)
Packit c4476c
	lda	$np,8($np)
Packit c4476c
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
Packit c4476c
	cmpult	$lo0,$lo1,AT
Packit c4476c
	subq	$lo1,$hi0,$lo0
Packit c4476c
	cmpult	$lo1,$lo0,$hi0
Packit c4476c
	or	$hi0,AT,$hi0
Packit c4476c
	stq	$lo0,0($rp)
Packit c4476c
	cmpult	$tp,$tj,v0
Packit c4476c
	lda	$rp,8($rp)
Packit c4476c
	bne	v0,.Lsub
Packit c4476c
Packit c4476c
	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
Packit c4476c
	mov	sp,$tp
Packit c4476c
	mov	$bp,$rp		# restore rp
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.Lcopy:	ldq	$aj,0($tp)	# conditional copy
Packit c4476c
	ldq	$nj,0($rp)
Packit c4476c
	lda	$tp,8($tp)
Packit c4476c
	lda	$rp,8($rp)
Packit c4476c
	cmoveq	$hi0,$nj,$aj
Packit c4476c
	stq	zero,-8($tp)	# zap tp
Packit c4476c
	cmpult	$tp,$tj,AT
Packit c4476c
	stq	$aj,-8($rp)
Packit c4476c
	bne	AT,.Lcopy
Packit c4476c
	mov	1,v0
Packit c4476c
Packit c4476c
.Lexit:
Packit c4476c
	.set	noreorder
Packit c4476c
	mov	fp,sp
Packit c4476c
	/*ldq	ra,0(sp)*/
Packit c4476c
	ldq	s3,8(sp)
Packit c4476c
	ldq	s4,16(sp)
Packit c4476c
	ldq	s5,24(sp)
Packit c4476c
	ldq	fp,32(sp)
Packit c4476c
	lda	sp,48(sp)
Packit c4476c
	ret	(ra)
Packit c4476c
.end	bn_mul_mont
Packit c4476c
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
Packit c4476c
.align	2
Packit c4476c
___
Packit c4476c
Packit c4476c
# Write the generated assembly to the (redirected) standard output.
print STDOUT $code;
Packit c4476c
# A failed close means buffered output did not make it to the file, so
# treat it as fatal instead of leaving a truncated result behind.
close(STDOUT) || die "error closing STDOUT: $!";