Blame crypto/bn/asm/ppc.pl

Packit c4476c
#! /usr/bin/env perl
Packit c4476c
# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
Packit c4476c
# Implemented as a Perl wrapper as we want to support several different
Packit c4476c
# architectures with single file. We pick up the target based on the
Packit c4476c
# file name we are asked to generate.
Packit c4476c
#
Packit c4476c
# It should be noted though that this perl code is nothing like
Packit c4476c
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
Packit c4476c
# as pre-processor to cover for platform differences in name decoration,
Packit c4476c
# linker tables, 32-/64-bit instruction sets...
Packit c4476c
#
Packit c4476c
# As you might know there are several PowerPC ABIs in use. Most notably
Packit c4476c
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
Packit c4476c
# are similar enough to implement leaf(!) functions, which would be ABI
Packit c4476c
# neutral. And that's what you find here: ABI neutral leaf functions.
Packit c4476c
# In case you wonder what that is...
Packit c4476c
#
Packit c4476c
#       AIX performance
Packit c4476c
#
Packit c4476c
#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
Packit c4476c
#
Packit c4476c
#	The following is the performance of 32-bit compiler
Packit c4476c
#	generated code:
Packit c4476c
#
Packit c4476c
#	OpenSSL 0.9.6c 21 dec 2001
Packit c4476c
#	built on: Tue Jun 11 11:06:51 EDT 2002
Packit c4476c
#	options:bn(64,32) ...
Packit c4476c
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
Packit c4476c
#                  sign    verify    sign/s verify/s
Packit c4476c
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
Packit c4476c
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
Packit c4476c
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
Packit c4476c
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
Packit c4476c
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
Packit c4476c
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
Packit c4476c
#
Packit c4476c
#	Same benchmark with this assembler code:
Packit c4476c
#
Packit c4476c
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
Packit c4476c
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
Packit c4476c
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
Packit c4476c
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
Packit c4476c
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
Packit c4476c
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
Packit c4476c
#
Packit c4476c
#	Number of operations increases by almost 75%
Packit c4476c
#
Packit c4476c
#	Here are performance numbers for 64-bit compiler
Packit c4476c
#	generated code:
Packit c4476c
#
Packit c4476c
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
Packit c4476c
#	built on: Fri Apr 18 16:59:20 EDT 2003
Packit c4476c
#	options:bn(64,64) ...
Packit c4476c
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
Packit c4476c
#                  sign    verify    sign/s verify/s
Packit c4476c
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
Packit c4476c
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
Packit c4476c
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
Packit c4476c
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
Packit c4476c
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
Packit c4476c
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
Packit c4476c
#
Packit c4476c
#	Same benchmark with this assembler code:
Packit c4476c
#
Packit c4476c
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
Packit c4476c
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
Packit c4476c
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
Packit c4476c
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
Packit c4476c
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
Packit c4476c
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
Packit c4476c
#
Packit c4476c
#	Again, performance increases by about 75%
Packit c4476c
#
Packit c4476c
#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
Packit c4476c
#       OpenSSL 0.9.7c 30 Sep 2003
Packit c4476c
#
Packit c4476c
#       Original code.
Packit c4476c
#
Packit c4476c
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
Packit c4476c
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
Packit c4476c
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
Packit c4476c
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
Packit c4476c
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
Packit c4476c
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
Packit c4476c
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
Packit c4476c
#
Packit c4476c
#       Same benchmark with this assembler code:
Packit c4476c
#
Packit c4476c
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
Packit c4476c
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
Packit c4476c
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
Packit c4476c
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
Packit c4476c
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
Packit c4476c
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
Packit c4476c
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
Packit c4476c
#
Packit c4476c
#        Performance increase of ~60%
Packit c4476c
#        Based on submission from Suresh N. Chari of IBM
Packit c4476c
Packit c4476c
# The first command-line argument is the target "flavour".  A flavour
# containing "32" selects the 32-bit instruction set (this is tested
# first), one containing "64" selects the 64-bit set; anything else is
# rejected before any of the variables below are assigned.
$flavour = shift;

if    ($flavour =~ /32/)	{ $BITS = 32; }
elsif ($flavour =~ /64/)	{ $BITS = 64; }
else				{ die "nonsense $flavour"; }

$BNSZ	= $BITS/8;		# word size in bytes
$ISA	= ($BITS == 32) ? "\"ppc\"" : "\"ppc64\"";

{
	# Each row below lists the 32-bit mnemonic followed by its 64-bit
	# counterpart; $pick is the column that matches $BITS.
	my $pick = ($BITS == 32) ? 0 : 1;

	$LD	= ("lwz",	"ld"    )[$pick];	# load
	$LDU	= ("lwzu",	"ldu"   )[$pick];	# load and update
	$ST	= ("stw",	"std"   )[$pick];	# store
	$STU	= ("stwu",	"stdu"  )[$pick];	# store and update
	$UMULL	= ("mullw",	"mulld" )[$pick];	# unsigned multiply low
	$UMULH	= ("mulhwu",	"mulhdu")[$pick];	# unsigned multiply high
	$UDIV	= ("divwu",	"divdu" )[$pick];	# unsigned divide
	$UCMPI	= ("cmplwi",	"cmpldi")[$pick];	# unsigned compare with immediate
	$UCMP	= ("cmplw",	"cmpld" )[$pick];	# unsigned compare
	$CNTLZ	= ("cntlzw",	"cntlzd")[$pick];	# count leading zeros
	$SHL	= ("slw",	"sld"   )[$pick];	# shift left
	$SHR	= ("srw",	"srd"   )[$pick];	# unsigned shift right
	$SHRI	= ("srwi",	"srdi"  )[$pick];	# unsigned shift right by immediate
	$SHLI	= ("slwi",	"sldi"  )[$pick];	# shift left by immediate
	$CLRU	= ("clrlwi",	"clrldi")[$pick];	# clear upper bits
	$INSR	= ("insrwi",	"insrdi")[$pick];	# insert right
	$ROTL	= ("rotlwi",	"rotldi")[$pick];	# rotate left by immediate
	$TR	= ("tw",	"td"    )[$pick];	# conditional trap
}
Packit c4476c
Packit c4476c
# Locate the ppc-xlate.pl translator relative to this script: first next
# to the script itself, then under ../../perlasm/ from that directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;		# directory part of $0, trailing separator included

$xlate = "${dir}ppc-xlate.pl";
if (! -f $xlate) {
	$xlate = "${dir}../../perlasm/ppc-xlate.pl";
	-f $xlate or die "can't locate ppc-xlate.pl";
}
Packit c4476c
Packit c4476c
# Redirect STDOUT into a pipe feeding the ppc-xlate.pl translator, run
# under the current perl interpreter ($^X).  The translator is passed the
# flavour followed by the next command-line argument.
#
# The call is parenthesised and uses the low-precedence "or": written as
#   open STDOUT,"...".shift || die ...
# the "||" applied to the concatenated command string, which is always
# true, so die could never run and a failed open() was silently ignored.
open(STDOUT, "| $^X $xlate $flavour " . shift)
	or die "can't call $xlate: $!";
Packit c4476c
Packit c4476c
$data=<
Packit c4476c
#--------------------------------------------------------------------
Packit c4476c
#
Packit c4476c
#
Packit c4476c
#
Packit c4476c
#
Packit c4476c
#	File:		ppc32.s
Packit c4476c
#
Packit c4476c
#	Created by:	Suresh Chari
Packit c4476c
#			IBM Thomas J. Watson Research Library
Packit c4476c
#			Hawthorne, NY
Packit c4476c
#
Packit c4476c
#
Packit c4476c
#	Description:	Optimized assembly routines for OpenSSL crypto
Packit c4476c
#			on the 32-bit PowerPC platform.
Packit c4476c
#
Packit c4476c
#
Packit c4476c
#	Version History
Packit c4476c
#
Packit c4476c
#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
Packit c4476c
#	   cleaned up code. Also made a single version which can
Packit c4476c
#	   be used for both the AIX and Linux compilers. See NOTE
Packit c4476c
#	   below.
Packit c4476c
#				12/05/03		Suresh Chari
Packit c4476c
#			(with lots of help from)        Andy Polyakov
Packit c4476c
##
Packit c4476c
#	1. Initial version	10/20/02		Suresh Chari
Packit c4476c
#
Packit c4476c
#
Packit c4476c
#	The following file works for the xlc,cc
Packit c4476c
#	and gcc compilers.
Packit c4476c
#
Packit c4476c
#	NOTE:	To get the file to link correctly with the gcc compiler
Packit c4476c
#	        you have to change the names of the routines and remove
Packit c4476c
#		the first .(dot) character. This should automatically
Packit c4476c
#		be done in the build process.
Packit c4476c
#
Packit c4476c
#	Hand optimized assembly code for the following routines
Packit c4476c
#
Packit c4476c
#	bn_sqr_comba4
Packit c4476c
#	bn_sqr_comba8
Packit c4476c
#	bn_mul_comba4
Packit c4476c
#	bn_mul_comba8
Packit c4476c
#	bn_sub_words
Packit c4476c
#	bn_add_words
Packit c4476c
#	bn_div_words
Packit c4476c
#	bn_sqr_words
Packit c4476c
#	bn_mul_words
Packit c4476c
#	bn_mul_add_words
Packit c4476c
#
Packit c4476c
#	NOTE:	It is possible to optimize this code more for
Packit c4476c
#	specific PowerPC or Power architectures. On the Northstar
Packit c4476c
#	architecture the optimizations in this file do
Packit c4476c
#	 NOT provide much improvement.
Packit c4476c
#
Packit c4476c
#	If you have comments or suggestions to improve code send
Packit c4476c
#	me a note at schari\@us.ibm.com
Packit c4476c
#
Packit c4476c
#--------------------------------------------------------------------------
Packit c4476c
#
Packit c4476c
#	Defines to be used in the assembly code.
Packit c4476c
#
Packit c4476c
#.set r0,0	# we use it as storage for value of 0
Packit c4476c
#.set SP,1	# preserved
Packit c4476c
#.set RTOC,2	# preserved
Packit c4476c
#.set r3,3	# 1st argument/return value
Packit c4476c
#.set r4,4	# 2nd argument/volatile register
Packit c4476c
#.set r5,5	# 3rd argument/volatile register
Packit c4476c
#.set r6,6	# ...
Packit c4476c
#.set r7,7
Packit c4476c
#.set r8,8
Packit c4476c
#.set r9,9
Packit c4476c
#.set r10,10
Packit c4476c
#.set r11,11
Packit c4476c
#.set r12,12
Packit c4476c
#.set r13,13	# not used, nor any other "below" it...
Packit c4476c
Packit c4476c
#	Declare function names to be global
Packit c4476c
#	NOTE:	For gcc these names MUST be changed to remove
Packit c4476c
#	        the first . i.e. for example change ".bn_sqr_comba4"
Packit c4476c
#		to "bn_sqr_comba4". This should be automatically done
Packit c4476c
#		in the build.
Packit c4476c
Packit c4476c
	.globl	.bn_sqr_comba4
Packit c4476c
	.globl	.bn_sqr_comba8
Packit c4476c
	.globl	.bn_mul_comba4
Packit c4476c
	.globl	.bn_mul_comba8
Packit c4476c
	.globl	.bn_sub_words
Packit c4476c
	.globl	.bn_add_words
Packit c4476c
	.globl	.bn_div_words
Packit c4476c
	.globl	.bn_sqr_words
Packit c4476c
	.globl	.bn_mul_words
Packit c4476c
	.globl	.bn_mul_add_words
Packit c4476c
Packit c4476c
# .text section
Packit c4476c
Packit c4476c
	.machine	"any"
Packit c4476c
	.text
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_sqr_comba4" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.bn_sqr_comba4:
Packit c4476c
#
Packit c4476c
# Optimized version of bn_sqr_comba4.
Packit c4476c
#
Packit c4476c
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
Packit c4476c
# r3 contains r
Packit c4476c
# r4 contains a
Packit c4476c
#
Packit c4476c
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
Packit c4476c
#
Packit c4476c
# r5,r6 are the two BN_ULONGs being multiplied.
Packit c4476c
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
Packit c4476c
# r9,r10, r11 are the equivalents of c1,c2, c3.
Packit c4476c
# Here's the assembly
Packit c4476c
#
Packit c4476c
#
Packit c4476c
	xor		r0,r0,r0		# set r0 = 0. Used in the addze
Packit c4476c
						# instructions below
Packit c4476c
Packit c4476c
						#sqr_add_c(a,0,c1,c2,c3)
Packit c4476c
	$LD		r5,`0*$BNSZ`(r4)
Packit c4476c
	$UMULL		r9,r5,r5
Packit c4476c
	$UMULH		r10,r5,r5		#in first iteration. No need
Packit c4476c
						#to add since c1=c2=c3=0.
Packit c4476c
						# Note c3(r11) is NOT set to 0
Packit c4476c
						# but will be.
Packit c4476c
Packit c4476c
	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
Packit c4476c
						# sqr_add_c2(a,1,0,c2,c3,c1);
Packit c4476c
	$LD		r6,`1*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
Packit c4476c
	adde		r8,r8,r8
Packit c4476c
	addze		r9,r0			# catch carry if any.
Packit c4476c
						# r9= r0(=0) and carry
Packit c4476c
Packit c4476c
	addc		r10,r7,r10		# now add to temp result.
Packit c4476c
	addze		r11,r8                  # r8 added to r11 which is 0
Packit c4476c
	addze		r9,r9
Packit c4476c
Packit c4476c
	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
Packit c4476c
						#sqr_add_c(a,1,c3,c1,c2)
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r0
Packit c4476c
						#sqr_add_c2(a,2,0,c3,c1,c2)
Packit c4476c
	$LD		r6,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r7,r7,r7
Packit c4476c
	adde		r8,r8,r8
Packit c4476c
	addze		r10,r10
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
Packit c4476c
						#sqr_add_c2(a,3,0,c1,c2,c3);
Packit c4476c
	$LD		r6,`3*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r7,r7,r7
Packit c4476c
	adde		r8,r8,r8
Packit c4476c
	addze		r11,r0
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
						#sqr_add_c2(a,2,1,c1,c2,c3);
Packit c4476c
	$LD		r5,`1*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r7,r7,r7
Packit c4476c
	adde		r8,r8,r8
Packit c4476c
	addze		r11,r11
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
Packit c4476c
						#sqr_add_c(a,2,c2,c3,c1);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r0
Packit c4476c
						#sqr_add_c2(a,3,1,c2,c3,c1);
Packit c4476c
	$LD		r6,`3*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r7,r7,r7
Packit c4476c
	adde		r8,r8,r8
Packit c4476c
	addze		r9,r9
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
Packit c4476c
						#sqr_add_c2(a,3,2,c3,c1,c2);
Packit c4476c
	$LD		r5,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r7,r7,r7
Packit c4476c
	adde		r8,r8,r8
Packit c4476c
	addze		r10,r0
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
Packit c4476c
						#sqr_add_c(a,3,c1,c2,c3);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
Packit c4476c
	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
Packit c4476c
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,2,0
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_sqr_comba4,.-.bn_sqr_comba4
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_sqr_comba8" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.bn_sqr_comba8:
Packit c4476c
#
Packit c4476c
# This is an optimized version of the bn_sqr_comba8 routine.
Packit c4476c
# Tightly uses the adde instruction
Packit c4476c
#
Packit c4476c
#
Packit c4476c
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
Packit c4476c
# r3 contains r
Packit c4476c
# r4 contains a
Packit c4476c
#
Packit c4476c
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
Packit c4476c
#
Packit c4476c
# r5,r6 are the two BN_ULONGs being multiplied.
Packit c4476c
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
Packit c4476c
# r9,r10, r11 are the equivalents of c1,c2, c3.
Packit c4476c
#
Packit c4476c
# Possible optimization of loading all 8 longs of a into registers
Packit c4476c
# doesn't provide any speedup
Packit c4476c
#
Packit c4476c
Packit c4476c
	xor		r0,r0,r0		#set r0 = 0.Used in addze
Packit c4476c
						#instructions below.
Packit c4476c
Packit c4476c
						#sqr_add_c(a,0,c1,c2,c3);
Packit c4476c
	$LD		r5,`0*$BNSZ`(r4)
Packit c4476c
	$UMULL		r9,r5,r5		#1st iteration:	no carries.
Packit c4476c
	$UMULH		r10,r5,r5
Packit c4476c
	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
Packit c4476c
						#sqr_add_c2(a,1,0,c2,c3,c1);
Packit c4476c
	$LD		r6,`1*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r10,r7,r10		#add the two register number
Packit c4476c
	adde		r11,r8,r0 		# (r8,r7) to the three register
Packit c4476c
	addze		r9,r0			# number (r9,r11,r10).NOTE:r0=0
Packit c4476c
Packit c4476c
	addc		r10,r7,r10		#add the two register number
Packit c4476c
	adde		r11,r8,r11 		# (r8,r7) to the three register
Packit c4476c
	addze		r9,r9			# number (r9,r11,r10).
Packit c4476c
Packit c4476c
	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2
Packit c4476c
Packit c4476c
						#sqr_add_c(a,1,c3,c1,c2);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r0
Packit c4476c
						#sqr_add_c2(a,2,0,c3,c1,c2);
Packit c4476c
	$LD		r6,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
Packit c4476c
	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
Packit c4476c
						#sqr_add_c2(a,3,0,c1,c2,c3);
Packit c4476c
	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r0
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
						#sqr_add_c2(a,2,1,c1,c2,c3);
Packit c4476c
	$LD		r5,`1*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
Packit c4476c
	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
Packit c4476c
						#sqr_add_c(a,2,c2,c3,c1);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r0
Packit c4476c
						#sqr_add_c2(a,3,1,c2,c3,c1);
Packit c4476c
	$LD		r6,`3*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
						#sqr_add_c2(a,4,0,c2,c3,c1);
Packit c4476c
	$LD		r5,`0*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`4*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
Packit c4476c
						#sqr_add_c2(a,5,0,c3,c1,c2);
Packit c4476c
	$LD		r6,`5*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r0
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
						#sqr_add_c2(a,4,1,c3,c1,c2);
Packit c4476c
	$LD		r5,`1*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`4*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
						#sqr_add_c2(a,3,2,c3,c1,c2);
Packit c4476c
	$LD		r5,`2*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`3*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
Packit c4476c
						#sqr_add_c(a,3,c1,c2,c3);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r0
Packit c4476c
						#sqr_add_c2(a,4,2,c1,c2,c3);
Packit c4476c
	$LD		r6,`4*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
						#sqr_add_c2(a,5,1,c1,c2,c3);
Packit c4476c
	$LD		r5,`1*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`5*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
						#sqr_add_c2(a,6,0,c1,c2,c3);
Packit c4476c
	$LD		r5,`0*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`6*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
Packit c4476c
						#sqr_add_c2(a,7,0,c2,c3,c1);
Packit c4476c
	$LD		r6,`7*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r0
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
						#sqr_add_c2(a,6,1,c2,c3,c1);
Packit c4476c
	$LD		r5,`1*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`6*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
						#sqr_add_c2(a,5,2,c2,c3,c1);
Packit c4476c
	$LD		r5,`2*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`5*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
						#sqr_add_c2(a,4,3,c2,c3,c1);
Packit c4476c
	$LD		r5,`3*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`4*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
Packit c4476c
						#sqr_add_c(a,4,c3,c1,c2);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r0
Packit c4476c
						#sqr_add_c2(a,5,3,c3,c1,c2);
Packit c4476c
	$LD		r6,`5*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
						#sqr_add_c2(a,6,2,c3,c1,c2);
Packit c4476c
	$LD		r5,`2*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`6*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
						#sqr_add_c2(a,7,1,c3,c1,c2);
Packit c4476c
	$LD		r5,`1*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`7*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
Packit c4476c
						#sqr_add_c2(a,7,2,c1,c2,c3);
Packit c4476c
	$LD		r5,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r0
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
						#sqr_add_c2(a,6,3,c1,c2,c3);
Packit c4476c
	$LD		r5,`3*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`6*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
						#sqr_add_c2(a,5,4,c1,c2,c3);
Packit c4476c
	$LD		r5,`4*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`5*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
Packit c4476c
						#sqr_add_c(a,5,c2,c3,c1);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r0
Packit c4476c
						#sqr_add_c2(a,6,4,c2,c3,c1);
Packit c4476c
	$LD		r6,`6*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
						#sqr_add_c2(a,7,3,c2,c3,c1);
Packit c4476c
	$LD		r5,`3*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`7*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
Packit c4476c
						#sqr_add_c2(a,7,4,c3,c1,c2);
Packit c4476c
	$LD		r5,`4*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r0
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
						#sqr_add_c2(a,6,5,c3,c1,c2);
Packit c4476c
	$LD		r5,`5*$BNSZ`(r4)
Packit c4476c
	$LD		r6,`6*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	addze		r10,r10
Packit c4476c
	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
Packit c4476c
						#sqr_add_c(a,6,c1,c2,c3);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r0
Packit c4476c
						#sqr_add_c2(a,7,5,c1,c2,c3)
Packit c4476c
	$LD		r6,`7*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
	addc		r9,r7,r9
Packit c4476c
	adde		r10,r8,r10
Packit c4476c
	addze		r11,r11
Packit c4476c
	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;
Packit c4476c
Packit c4476c
						#sqr_add_c2(a,7,6,c2,c3,c1)
Packit c4476c
	$LD		r5,`6*$BNSZ`(r4)
Packit c4476c
	$UMULL		r7,r5,r6
Packit c4476c
	$UMULH		r8,r5,r6
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r0
Packit c4476c
	addc		r10,r7,r10
Packit c4476c
	adde		r11,r8,r11
Packit c4476c
	addze		r9,r9
Packit c4476c
	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
Packit c4476c
						#sqr_add_c(a,7,c3,c1,c2);
Packit c4476c
	$UMULL		r7,r6,r6
Packit c4476c
	$UMULH		r8,r6,r6
Packit c4476c
	addc		r11,r7,r11
Packit c4476c
	adde		r9,r8,r9
Packit c4476c
	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
Packit c4476c
	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
Packit c4476c
Packit c4476c
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,2,0
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_sqr_comba8,.-.bn_sqr_comba8
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_mul_comba4" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.bn_mul_comba4:
Packit c4476c
#
Packit c4476c
# This is an optimized version of the bn_mul_comba4 routine.
Packit c4476c
#
Packit c4476c
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
Packit c4476c
# r3 contains r
Packit c4476c
# r4 contains a
Packit c4476c
# r5 contains b
Packit c4476c
# r6, r7 are the 2 BN_ULONGs being multiplied.
Packit c4476c
# r8, r9 are the results of the 32x32 giving 64 bit multiply.
Packit c4476c
# r10, r11, r12 are the equivalents of c1, c2, and c3.
Packit c4476c
#
Packit c4476c
	xor	r0,r0,r0		#r0=0. Used in addze below.
Packit c4476c
					#mul_add_c(a[0],b[0],c1,c2,c3);
Packit c4476c
	$LD	r6,`0*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`0*$BNSZ`(r5)
Packit c4476c
	$UMULL	r10,r6,r7
Packit c4476c
	$UMULH	r11,r6,r7
Packit c4476c
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
Packit c4476c
					#mul_add_c(a[0],b[1],c2,c3,c1);
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r8,r11
Packit c4476c
	adde	r12,r9,r0
Packit c4476c
	addze	r10,r0
Packit c4476c
					#mul_add_c(a[1],b[0],c2,c3,c1);
Packit c4476c
	$LD	r6, `1*$BNSZ`(r4)
Packit c4476c
	$LD	r7, `0*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r8,r11
Packit c4476c
	adde	r12,r9,r12
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
Packit c4476c
					#mul_add_c(a[2],b[0],c3,c1,c2);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r8,r12
Packit c4476c
	adde	r10,r9,r10
Packit c4476c
	addze	r11,r0
Packit c4476c
					#mul_add_c(a[1],b[1],c3,c1,c2);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r8,r12
Packit c4476c
	adde	r10,r9,r10
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[0],b[2],c3,c1,c2);
Packit c4476c
	$LD	r6,`0*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r8,r12
Packit c4476c
	adde	r10,r9,r10
Packit c4476c
	addze	r11,r11
Packit c4476c
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
Packit c4476c
					#mul_add_c(a[0],b[3],c1,c2,c3);
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r8,r10
Packit c4476c
	adde	r11,r9,r11
Packit c4476c
	addze	r12,r0
Packit c4476c
					#mul_add_c(a[1],b[2],c1,c2,c3);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r8,r10
Packit c4476c
	adde	r11,r9,r11
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[2],b[1],c1,c2,c3);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r8,r10
Packit c4476c
	adde	r11,r9,r11
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[3],b[0],c1,c2,c3);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`0*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r8,r10
Packit c4476c
	adde	r11,r9,r11
Packit c4476c
	addze	r12,r12
Packit c4476c
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
Packit c4476c
					#mul_add_c(a[3],b[1],c2,c3,c1);
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r8,r11
Packit c4476c
	adde	r12,r9,r12
Packit c4476c
	addze	r10,r0
Packit c4476c
					#mul_add_c(a[2],b[2],c2,c3,c1);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r8,r11
Packit c4476c
	adde	r12,r9,r12
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[1],b[3],c2,c3,c1);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r8,r11
Packit c4476c
	adde	r12,r9,r12
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
Packit c4476c
					#mul_add_c(a[2],b[3],c3,c1,c2);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r8,r12
Packit c4476c
	adde	r10,r9,r10
Packit c4476c
	addze	r11,r0
Packit c4476c
					#mul_add_c(a[3],b[2],c3,c1,c2);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r8,r12
Packit c4476c
	adde	r10,r9,r10
Packit c4476c
	addze	r11,r11
Packit c4476c
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
Packit c4476c
					#mul_add_c(a[3],b[3],c1,c2,c3);
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r8,r10
Packit c4476c
	adde	r11,r9,r11
Packit c4476c
Packit c4476c
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
Packit c4476c
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,3,0
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_mul_comba4,.-.bn_mul_comba4
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_mul_comba8" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.bn_mul_comba8:
Packit c4476c
#
Packit c4476c
# Fully unrolled 8x8-word comba multiplication: r[0..15] = a[0..7] * b[0..7].
Packit c4476c
#
Packit c4476c
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
Packit c4476c
# r3 contains r
Packit c4476c
# r4 contains a
Packit c4476c
# r5 contains b
Packit c4476c
# r6, r7 are the 2 BN_ULONGs being multiplied.
Packit c4476c
# r8, r9 are the low ($UMULL) and high ($UMULH) words of their product.
Packit c4476c
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
# Each mul_add_c(a[i],b[j],cX,cY,cZ) step adds the double-width product
# into the three-word accumulator: addc adds the low word to cX, adde adds
# the high word plus carry to cY, and addze folds the remaining carry into
# cZ. The first step of a column uses "addze cZ,r0" (r0 = 0) so that the
# stale value of cZ is discarded. The roles of c1/c2/c3 rotate after each
# result word is stored.
#
# NOTE(review): $LD/$ST/$UMULL/$UMULH/$BNSZ are substituted by the
# generating script outside this section; presumably word load/store,
# unsigned multiply low/high and the BN_ULONG size in bytes -- confirm.
Packit c4476c
#
Packit c4476c
	xor	r0,r0,r0		#r0=0. Used in addze below.
Packit c4476c
Packit c4476c
					#mul_add_c(a[0],b[0],c1,c2,c3);
Packit c4476c
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
Packit c4476c
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
Packit c4476c
	$UMULL	r10,r6,r7
Packit c4476c
	$UMULH	r11,r6,r7
Packit c4476c
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
Packit c4476c
					#mul_add_c(a[0],b[1],c2,c3,c1);
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)	#r6 still holds a[0]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	addze	r12,r9			# r12 = r9 + carry; r12 was never zeroed, so adde cannot be used here.
Packit c4476c
	addze	r10,r0
Packit c4476c
					#mul_add_c(a[1],b[0],c2,c3,c1);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`0*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
Packit c4476c
					#mul_add_c(a[2],b[0],c3,c1,c2);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)	#r7 still holds b[0]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r0
Packit c4476c
					#mul_add_c(a[1],b[1],c3,c1,c2);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[0],b[2],c3,c1,c2);
Packit c4476c
	$LD	r6,`0*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
Packit c4476c
					#mul_add_c(a[0],b[3],c1,c2,c3);
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)	#r6 still holds a[0]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r0
Packit c4476c
					#mul_add_c(a[1],b[2],c1,c2,c3);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
Packit c4476c
					#mul_add_c(a[2],b[1],c1,c2,c3);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[3],b[0],c1,c2,c3);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`0*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
Packit c4476c
					#mul_add_c(a[4],b[0],c2,c3,c1);
Packit c4476c
	$LD	r6,`4*$BNSZ`(r4)	#r7 still holds b[0]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r0
Packit c4476c
					#mul_add_c(a[3],b[1],c2,c3,c1);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[2],b[2],c2,c3,c1);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[1],b[3],c2,c3,c1);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[0],b[4],c2,c3,c1);
Packit c4476c
	$LD	r6,`0*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`4*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
Packit c4476c
					#mul_add_c(a[0],b[5],c3,c1,c2);
Packit c4476c
	$LD	r7,`5*$BNSZ`(r5)	#r6 still holds a[0]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r0
Packit c4476c
					#mul_add_c(a[1],b[4],c3,c1,c2);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`4*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[2],b[3],c3,c1,c2);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[3],b[2],c3,c1,c2);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[4],b[1],c3,c1,c2);
Packit c4476c
	$LD	r6,`4*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[5],b[0],c3,c1,c2);
Packit c4476c
	$LD	r6,`5*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`0*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
Packit c4476c
					#mul_add_c(a[6],b[0],c1,c2,c3);
Packit c4476c
	$LD	r6,`6*$BNSZ`(r4)	#r7 still holds b[0]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r0
Packit c4476c
					#mul_add_c(a[5],b[1],c1,c2,c3);
Packit c4476c
	$LD	r6,`5*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[4],b[2],c1,c2,c3);
Packit c4476c
	$LD	r6,`4*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[3],b[3],c1,c2,c3);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[2],b[4],c1,c2,c3);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`4*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[1],b[5],c1,c2,c3);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`5*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[0],b[6],c1,c2,c3);
Packit c4476c
	$LD	r6,`0*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`6*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
Packit c4476c
					#mul_add_c(a[0],b[7],c2,c3,c1);
Packit c4476c
	$LD	r7,`7*$BNSZ`(r5)	#r6 still holds a[0]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r0
Packit c4476c
					#mul_add_c(a[1],b[6],c2,c3,c1);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`6*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[2],b[5],c2,c3,c1);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`5*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[3],b[4],c2,c3,c1);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`4*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[4],b[3],c2,c3,c1);
Packit c4476c
	$LD	r6,`4*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[5],b[2],c2,c3,c1);
Packit c4476c
	$LD	r6,`5*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[6],b[1],c2,c3,c1);
Packit c4476c
	$LD	r6,`6*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[7],b[0],c2,c3,c1);
Packit c4476c
	$LD	r6,`7*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`0*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
Packit c4476c
					#mul_add_c(a[7],b[1],c3,c1,c2);
Packit c4476c
	$LD	r7,`1*$BNSZ`(r5)	#r6 still holds a[7]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r0
Packit c4476c
					#mul_add_c(a[6],b[2],c3,c1,c2);
Packit c4476c
	$LD	r6,`6*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[5],b[3],c3,c1,c2);
Packit c4476c
	$LD	r6,`5*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[4],b[4],c3,c1,c2);
Packit c4476c
	$LD	r6,`4*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`4*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[3],b[5],c3,c1,c2);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`5*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[2],b[6],c3,c1,c2);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`6*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[1],b[7],c3,c1,c2);
Packit c4476c
	$LD	r6,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`7*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
Packit c4476c
					#mul_add_c(a[2],b[7],c1,c2,c3);
Packit c4476c
	$LD	r6,`2*$BNSZ`(r4)	#r7 still holds b[7]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r0
Packit c4476c
					#mul_add_c(a[3],b[6],c1,c2,c3);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`6*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[4],b[5],c1,c2,c3);
Packit c4476c
	$LD	r6,`4*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`5*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[5],b[4],c1,c2,c3);
Packit c4476c
	$LD	r6,`5*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`4*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[6],b[3],c1,c2,c3);
Packit c4476c
	$LD	r6,`6*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[7],b[2],c1,c2,c3);
Packit c4476c
	$LD	r6,`7*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`2*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
Packit c4476c
					#mul_add_c(a[7],b[3],c2,c3,c1);
Packit c4476c
	$LD	r7,`3*$BNSZ`(r5)	#r6 still holds a[7]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r0
Packit c4476c
					#mul_add_c(a[6],b[4],c2,c3,c1);
Packit c4476c
	$LD	r6,`6*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`4*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[5],b[5],c2,c3,c1);
Packit c4476c
	$LD	r6,`5*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`5*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[4],b[6],c2,c3,c1);
Packit c4476c
	$LD	r6,`4*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`6*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
					#mul_add_c(a[3],b[7],c2,c3,c1);
Packit c4476c
	$LD	r6,`3*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`7*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
Packit c4476c
					#mul_add_c(a[4],b[7],c3,c1,c2);
Packit c4476c
	$LD	r6,`4*$BNSZ`(r4)	#r7 still holds b[7]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r0
Packit c4476c
					#mul_add_c(a[5],b[6],c3,c1,c2);
Packit c4476c
	$LD	r6,`5*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`6*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[6],b[5],c3,c1,c2);
Packit c4476c
	$LD	r6,`6*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`5*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
					#mul_add_c(a[7],b[4],c3,c1,c2);
Packit c4476c
	$LD	r6,`7*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`4*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9
Packit c4476c
	addze	r11,r11
Packit c4476c
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
Packit c4476c
					#mul_add_c(a[7],b[5],c1,c2,c3);
Packit c4476c
	$LD	r7,`5*$BNSZ`(r5)	#r6 still holds a[7]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r0
Packit c4476c
					#mul_add_c(a[6],b[6],c1,c2,c3);
Packit c4476c
	$LD	r6,`6*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`6*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
					#mul_add_c(a[5],b[7],c1,c2,c3);
Packit c4476c
	$LD	r6,`5*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`7*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r10,r10,r8
Packit c4476c
	adde	r11,r11,r9
Packit c4476c
	addze	r12,r12
Packit c4476c
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
Packit c4476c
					#mul_add_c(a[6],b[7],c2,c3,c1);
Packit c4476c
	$LD	r6,`6*$BNSZ`(r4)	#r7 still holds b[7]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r0
Packit c4476c
					#mul_add_c(a[7],b[6],c2,c3,c1);
Packit c4476c
	$LD	r6,`7*$BNSZ`(r4)
Packit c4476c
	$LD	r7,`6*$BNSZ`(r5)
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r11,r11,r8
Packit c4476c
	adde	r12,r12,r9
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
Packit c4476c
					#mul_add_c(a[7],b[7],c3,c1,c2);
Packit c4476c
	$LD	r7,`7*$BNSZ`(r5)	#r6 still holds a[7]
Packit c4476c
	$UMULL	r8,r6,r7
Packit c4476c
	$UMULH	r9,r6,r7
Packit c4476c
	addc	r12,r12,r8
Packit c4476c
	adde	r10,r10,r9		#no addze: an 8x8-word product fits in 16 words
Packit c4476c
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
Packit c4476c
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,3,0	# NOTE(review): looks like a traceback table; the 3 presumably is the parameter count -- confirm
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_mul_comba8,.-.bn_mul_comba8
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_sub_words" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
#
Packit c4476c
.align	4
Packit c4476c
.bn_sub_words:
Packit c4476c
#
Packit c4476c
#	Handcoded version of bn_sub_words: r[i] = a[i] - b[i] - borrow
#	for i = 0..n-1. Returns the final borrow (0 or 1) in r3.
Packit c4476c
#
Packit c4476c
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
Packit c4476c
#
Packit c4476c
#	r3 = r
Packit c4476c
#	r4 = a
Packit c4476c
#	r5 = b
Packit c4476c
#	r6 = n
Packit c4476c
#
Packit c4476c
#       Note:	No loop unrolling done since this is not a performance
Packit c4476c
#               critical loop.
Packit c4476c
Packit c4476c
	xor	r0,r0,r0	#set r0 = 0
Packit c4476c
#
Packit c4476c
#	check for r6 = 0 AND set carry bit.
#	(On PowerPC carry set means "no borrow" for the subfe chain below.)
Packit c4476c
#
Packit c4476c
	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
Packit c4476c
				# if r6 > 0 then result !=0
Packit c4476c
				# In either case carry bit is set.
Packit c4476c
	beq	Lppcasm_sub_adios
Packit c4476c
	addi	r4,r4,-$BNSZ	# bias the pointers back by one word: the loop
Packit c4476c
	addi	r3,r3,-$BNSZ	# accesses memory at $BNSZ(rX) via $LDU/$STU
Packit c4476c
	addi	r5,r5,-$BNSZ	# (presumably update-form, advancing rX each time)
Packit c4476c
	mtctr	r6		# CTR = n, the loop count for bdnz
Packit c4476c
Lppcasm_sub_mainloop:
Packit c4476c
	$LDU	r7,$BNSZ(r4)
Packit c4476c
	$LDU	r8,$BNSZ(r5)
Packit c4476c
	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
Packit c4476c
				# if carry = 1 this is r7-r8. Else it
Packit c4476c
				# is r7-r8 -1 as we need.
Packit c4476c
	$STU	r6,$BNSZ(r3)
Packit c4476c
	bdnz	Lppcasm_sub_mainloop
Packit c4476c
Lppcasm_sub_adios:
Packit c4476c
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
Packit c4476c
	andi.	r3,r3,1         # keep only last bit: 1 = borrow out, 0 = none.
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,4,0	# NOTE(review): looks like a traceback table; the 4 presumably is the parameter count -- confirm
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_sub_words,.-.bn_sub_words
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_add_words" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.bn_add_words:
Packit c4476c
#
Packit c4476c
#	Handcoded version of bn_add_words: r[i] = a[i] + b[i] + carry
#	for i = 0..n-1. Returns the final carry (0 or 1) in r3.
Packit c4476c
#
Packit c4476c
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
Packit c4476c
#
Packit c4476c
#	r3 = r
Packit c4476c
#	r4 = a
Packit c4476c
#	r5 = b
Packit c4476c
#	r6 = n
Packit c4476c
#
Packit c4476c
#       Note:	No loop unrolling done since this is not a performance
Packit c4476c
#               critical loop.
Packit c4476c
Packit c4476c
	xor	r0,r0,r0
Packit c4476c
#
Packit c4476c
#	check for r6 = 0. This is needed: bdnz decrements CTR before
#	testing it, so entering the loop with CTR = 0 would wrap around
#	instead of running zero iterations.
Packit c4476c
#
Packit c4476c
	addic.	r6,r6,0		#test r6 and clear carry bit.
Packit c4476c
	beq	Lppcasm_add_adios
Packit c4476c
	addi	r4,r4,-$BNSZ	# bias the pointers back by one word: the loop
Packit c4476c
	addi	r3,r3,-$BNSZ	# accesses memory at $BNSZ(rX) via $LDU/$STU
Packit c4476c
	addi	r5,r5,-$BNSZ	# (presumably update-form, advancing rX each time)
Packit c4476c
	mtctr	r6		# CTR = n, the loop count for bdnz
Packit c4476c
Lppcasm_add_mainloop:
Packit c4476c
	$LDU	r7,$BNSZ(r4)
Packit c4476c
	$LDU	r8,$BNSZ(r5)
Packit c4476c
	adde	r8,r7,r8	# r8 = a[i] + b[i] + carry in; sets carry out
Packit c4476c
	$STU	r8,$BNSZ(r3)
Packit c4476c
	bdnz	Lppcasm_add_mainloop
Packit c4476c
Lppcasm_add_adios:
Packit c4476c
	addze	r3,r0			#return carry bit.
Packit c4476c
	blr
Packit c4476c
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,4,0	# NOTE(review): looks like a traceback table; the 4 presumably is the parameter count -- confirm
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_add_words,.-.bn_add_words
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_div_words" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.bn_div_words:
Packit c4476c
#
Packit c4476c
#	This is a cleaned up version of code generated by
Packit c4476c
#	the AIX compiler. The only optimization is to use
Packit c4476c
#	the PPC instruction to count leading zeros instead
Packit c4476c
#	of call to num_bits_word. Since this was compiled
Packit c4476c
#	only at level -O2 we can possibly squeeze it more?
Packit c4476c
#
#	Divides the double-word value h:l by d and returns the quotient
#	word in r3; returns -1 when d is 0.
#
#	Outline, as implemented below:
#	  1. d is normalised (shifted left by its leading-zero count) and
#	     h:l is shifted left by the same amount.
#	  2. The outer loop runs twice (CTR = 2). Each pass estimates one
#	     half-word quotient digit q from h / dh, corrects it in the
#	     inner loop, and subtracts q*d from the running remainder.
#	  3. The first digit is kept in r0 shifted up by half a word and
#	     is OR-ed with the second digit to form the result.
#
Packit c4476c
#	r3 = h
Packit c4476c
#	r4 = l
Packit c4476c
#	r5 = d
Packit c4476c
Packit c4476c
	$UCMPI	0,r5,0			# compare r5 and 0
Packit c4476c
	bne	Lppcasm_div1		# proceed if d!=0
Packit c4476c
	li	r3,-1			# d=0 return -1
Packit c4476c
	blr
Packit c4476c
Lppcasm_div1:
Packit c4476c
	xor	r0,r0,r0		#r0=0
Packit c4476c
	li	r8,$BITS
Packit c4476c
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
Packit c4476c
	beq	Lppcasm_div2		#proceed if no leading zeros
Packit c4476c
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
Packit c4476c
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
Packit c4476c
	$TR	16,r9,r0		#if there're, signal to dump core...
					# NOTE(review): 16 is the trap
					# condition mask. If this macro is
					# tw/td, that mask appears to select
					# signed less-than only, which a
					# logically right-shifted r9 compared
					# with r0 = 0 would never satisfy.
					# TODO confirm this check can fire.
Packit c4476c
Lppcasm_div2:
Packit c4476c
	$UCMP	0,r3,r5			#h>=d?
Packit c4476c
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
Packit c4476c
	subf	r3,r5,r3		#h-=d ;
Packit c4476c
Lppcasm_div3:				#r7 = BN_BITS2 - BN_num_bits_word(d) = shift count i
Packit c4476c
	cmpi	0,0,r7,0		# is (i == 0)?
Packit c4476c
	beq	Lppcasm_div4
Packit c4476c
	$SHL	r3,r3,r7		# h = (h<< i)
Packit c4476c
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
Packit c4476c
	$SHL	r5,r5,r7		# d<<=i
Packit c4476c
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
Packit c4476c
	$SHL	r4,r4,r7		# l <<=i
Packit c4476c
Lppcasm_div4:
Packit c4476c
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
Packit c4476c
					# dl will be computed when needed
Packit c4476c
					# as it saves registers.
Packit c4476c
	li	r6,2			#r6=2
Packit c4476c
	mtctr	r6			#counter will be in count.
Packit c4476c
Lppcasm_divouterloop:
Packit c4476c
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
Packit c4476c
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
Packit c4476c
					# compute here for innerloop.
Packit c4476c
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
Packit c4476c
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
Packit c4476c
Packit c4476c
	li	r8,-1
Packit c4476c
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
Packit c4476c
	b	Lppcasm_div6
Packit c4476c
Lppcasm_div5:
Packit c4476c
	$UDIV	r8,r3,r9		#q = h/dh
Packit c4476c
Lppcasm_div6:
Packit c4476c
	$UMULL	r12,r9,r8		#th = q*dh
Packit c4476c
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
Packit c4476c
	$UMULL	r6,r8,r10		#tl = q*dl
Packit c4476c
Packit c4476c
Lppcasm_divinnerloop:
					# Correct the estimate: while the
					# upper half of t is zero and tl
					# exceeds r7, decrement q.
Packit c4476c
	subf	r10,r12,r3		#t = h -th
Packit c4476c
	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
Packit c4476c
	addic.	r7,r7,0			#test if r7 == 0. used below.
Packit c4476c
					# now want to compute
Packit c4476c
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
Packit c4476c
					# the following 2 instructions do that
Packit c4476c
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
Packit c4476c
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
Packit c4476c
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
Packit c4476c
	bne	Lppcasm_divinnerexit	# cr0 from addic.: leave if (t>>BN_BITS4) != 0
Packit c4476c
	ble	cr1,Lppcasm_divinnerexit	# leave if tl <= r7
Packit c4476c
	addi	r8,r8,-1		#q--
Packit c4476c
	subf	r12,r9,r12		#th -=dh
Packit c4476c
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
Packit c4476c
	subf	r6,r10,r6		#tl -=dl
Packit c4476c
	b	Lppcasm_divinnerloop
Packit c4476c
Lppcasm_divinnerexit:
Packit c4476c
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
Packit c4476c
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h
Packit c4476c
	$UCMP	cr1,r4,r11		# compare l and tl
Packit c4476c
	add	r12,r12,r10		# th+=t
Packit c4476c
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
Packit c4476c
	addi	r12,r12,1		# th++
Packit c4476c
Lppcasm_div7:
Packit c4476c
	subf	r11,r11,r4		#r11=l-tl
Packit c4476c
	$UCMP	cr1,r3,r12		#compare h and th
Packit c4476c
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
Packit c4476c
	addi	r8,r8,-1		# q--
Packit c4476c
	add	r3,r5,r3		# h+=d
Packit c4476c
Lppcasm_div8:
Packit c4476c
	subf	r12,r12,r3		#r12 = h-th
Packit c4476c
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
Packit c4476c
					# want to compute
Packit c4476c
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
Packit c4476c
					# the following 2 instructions will do this.
Packit c4476c
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
Packit c4476c
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
Packit c4476c
	bdz	Lppcasm_div9		#if (count==0) break ;
Packit c4476c
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
Packit c4476c
	b	Lppcasm_divouterloop
Packit c4476c
Lppcasm_div9:
Packit c4476c
	or	r3,r8,r0		# result = first digit (in r0) | second digit
Packit c4476c
	blr
Packit c4476c
					# NOTE(review): the data words below
					# look like a traceback table; the 7th
					# byte equals the argument count (3)
					# -- confirm the format.
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,3,0
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_div_words,.-.bn_div_words
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_sqr_words" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
.align	4
Packit c4476c
.bn_sqr_words:
Packit c4476c
#
Packit c4476c
#	Optimized version of bn_sqr_words
#
#	For each of the n input words a[i], writes two consecutive words
#	to r: first the word produced by the UMULL macro, then the word
#	produced by the UMULH macro (presumably the low and high halves
#	of a[i]*a[i]; the macros are defined outside this section).
#	Nothing is returned.
Packit c4476c
#
Packit c4476c
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
Packit c4476c
#
Packit c4476c
#	r3 = r
Packit c4476c
#	r4 = a
Packit c4476c
#	r5 = n
Packit c4476c
#
Packit c4476c
#	r6 = a[i].
Packit c4476c
#	r7,r8 = product.
Packit c4476c
#
Packit c4476c
#	No unrolling done here. Not performance critical.
Packit c4476c
Packit c4476c
	addic.	r5,r5,0			#test r5.
Packit c4476c
	beq	Lppcasm_sqr_adios	# n == 0: nothing to do
Packit c4476c
					# Bias both pointers down by one word;
					# the loop accesses use displacement
					# BNSZ (presumably update forms that
					# advance the pointer each time).
	addi	r4,r4,-$BNSZ
Packit c4476c
	addi	r3,r3,-$BNSZ
Packit c4476c
	mtctr	r5			# CTR = n, the loop count
Packit c4476c
Lppcasm_sqr_mainloop:
Packit c4476c
					#sqr(r[0],r[1],a[0]);
Packit c4476c
	$LDU	r6,$BNSZ(r4)		# r6 = next word of a
Packit c4476c
	$UMULL	r7,r6,r6
Packit c4476c
	$UMULH  r8,r6,r6
Packit c4476c
	$STU	r7,$BNSZ(r3)		# first output word of the pair
Packit c4476c
	$STU	r8,$BNSZ(r3)		# second output word of the pair
Packit c4476c
	bdnz	Lppcasm_sqr_mainloop	# decrement CTR, loop while non-zero
Packit c4476c
Lppcasm_sqr_adios:
Packit c4476c
	blr
Packit c4476c
					# NOTE(review): the data words below
					# look like a traceback table; the 7th
					# byte equals the argument count (3)
					# -- confirm the format.
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,3,0
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_sqr_words,.-.bn_sqr_words
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_mul_words" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.bn_mul_words:
Packit c4476c
#
Packit c4476c
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
Packit c4476c
#
# For i = 0 .. num-1 stores the low word of ap[i]*w + carry in rp[i]; the
# high word (plus any carry out of the addition) becomes the carry for the
# next word. The final carry word is returned in r3.
#
# The main loop handles four words per pass (num >> 2 passes); the code
# after it handles the remaining num & 3 words one at a time.
#
Packit c4476c
# r3 = rp
Packit c4476c
# r4 = ap
Packit c4476c
# r5 = num
Packit c4476c
# r6 = w
Packit c4476c
	xor	r0,r0,r0		# r0 = 0 (not referenced again in this routine)
Packit c4476c
	xor	r12,r12,r12		# used for carry
Packit c4476c
	rlwinm.	r7,r5,30,2,31		# num >> 2
Packit c4476c
	beq	Lppcasm_mw_REM
Packit c4476c
	mtctr	r7
Packit c4476c
Lppcasm_mw_LOOP:
Packit c4476c
					#mul(rp[0],ap[0],w,c1);
Packit c4476c
	$LD	r8,`0*$BNSZ`(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	addc	r9,r9,r12
Packit c4476c
	#addze	r10,r10			#carry is NOT ignored.
Packit c4476c
					#will be taken care of
Packit c4476c
					#in second spin below
Packit c4476c
					#using adde.
Packit c4476c
	$ST	r9,`0*$BNSZ`(r3)
Packit c4476c
					#mul(rp[1],ap[1],w,c1);
Packit c4476c
	$LD	r8,`1*$BNSZ`(r4)
Packit c4476c
	$UMULL	r11,r6,r8
Packit c4476c
	$UMULH  r12,r6,r8
Packit c4476c
	adde	r11,r11,r10		# low word + previous high word + pending carry
Packit c4476c
	#addze	r12,r12
Packit c4476c
	$ST	r11,`1*$BNSZ`(r3)
Packit c4476c
					#mul(rp[2],ap[2],w,c1);
Packit c4476c
	$LD	r8,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	adde	r9,r9,r12
Packit c4476c
	#addze	r10,r10
Packit c4476c
	$ST	r9,`2*$BNSZ`(r3)
Packit c4476c
					#mul(rp[3],ap[3],w,c1);
Packit c4476c
	$LD	r8,`3*$BNSZ`(r4)
Packit c4476c
	$UMULL	r11,r6,r8
Packit c4476c
	$UMULH  r12,r6,r8
Packit c4476c
	adde	r11,r11,r10
Packit c4476c
	addze	r12,r12			#this spin we collect carry into
Packit c4476c
					#r12
Packit c4476c
	$ST	r11,`3*$BNSZ`(r3)
Packit c4476c
Packit c4476c
	addi	r3,r3,`4*$BNSZ`		# advance rp by four words
Packit c4476c
	addi	r4,r4,`4*$BNSZ`		# advance ap by four words
Packit c4476c
	bdnz	Lppcasm_mw_LOOP
Packit c4476c
Packit c4476c
Lppcasm_mw_REM:
Packit c4476c
	andi.	r5,r5,0x3		# r5 = num & 3 = words still to do
Packit c4476c
	beq	Lppcasm_mw_OVER
Packit c4476c
					#mul(rp[0],ap[0],w,c1);
Packit c4476c
	$LD	r8,`0*$BNSZ`(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	addc	r9,r9,r12
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r9,`0*$BNSZ`(r3)
Packit c4476c
	addi	r12,r10,0		# r12 = carry for the next word
Packit c4476c
Packit c4476c
	addi	r5,r5,-1
Packit c4476c
	cmpli	0,0,r5,0
Packit c4476c
	beq	Lppcasm_mw_OVER
Packit c4476c
Packit c4476c
Packit c4476c
					#mul(rp[1],ap[1],w,c1);
Packit c4476c
	$LD	r8,`1*$BNSZ`(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	addc	r9,r9,r12
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r9,`1*$BNSZ`(r3)
Packit c4476c
	addi	r12,r10,0		# r12 = carry for the next word
Packit c4476c
Packit c4476c
	addi	r5,r5,-1
Packit c4476c
	cmpli	0,0,r5,0
Packit c4476c
	beq	Lppcasm_mw_OVER
Packit c4476c
Packit c4476c
					#mul(rp[2],ap[2],w,c1);
Packit c4476c
	$LD	r8,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	addc	r9,r9,r12
Packit c4476c
	addze	r10,r10
Packit c4476c
	$ST	r9,`2*$BNSZ`(r3)
Packit c4476c
	addi	r12,r10,0		# r12 = final carry
Packit c4476c
Packit c4476c
Lppcasm_mw_OVER:
Packit c4476c
	addi	r3,r12,0		# return the carry word
Packit c4476c
	blr
Packit c4476c
					# NOTE(review): the data words below
					# look like a traceback table; the 7th
					# byte equals the argument count (4)
					# -- confirm the format.
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,4,0
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_mul_words,.-.bn_mul_words
Packit c4476c
Packit c4476c
#
Packit c4476c
#	NOTE:	The following label name should be changed to
Packit c4476c
#		"bn_mul_add_words" i.e. remove the first dot
Packit c4476c
#		for the gcc compiler. This should be automatically
Packit c4476c
#		done in the build
Packit c4476c
#
Packit c4476c
Packit c4476c
.align	4
Packit c4476c
.bn_mul_add_words:
Packit c4476c
#
Packit c4476c
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
Packit c4476c
#
# For i = 0 .. num-1 stores the low word of rp[i] + ap[i]*w + carry back
# into rp[i]; everything above the low word becomes the carry for the next
# word. The final carry word is returned in r3.
#
# The main loop handles four words per pass (num >> 2 passes); the
# leftover code handles the remaining num & 3 words.
#
Packit c4476c
# r3 = rp
Packit c4476c
# r4 = ap
Packit c4476c
# r5 = num
Packit c4476c
# r6 = w
Packit c4476c
#
Packit c4476c
# empirical evidence suggests that unrolled version performs best!!
Packit c4476c
#
Packit c4476c
	xor	r0,r0,r0		#r0 = 0 (not referenced again in this routine)
Packit c4476c
	xor	r12,r12,r12  		#r12 = 0 . used for carry
Packit c4476c
	rlwinm.	r7,r5,30,2,31		# num >> 2
Packit c4476c
	beq	Lppcasm_maw_leftover	# if (num < 4) go Lppcasm_maw_leftover
Packit c4476c
	mtctr	r7
Packit c4476c
Lppcasm_maw_mainloop:
Packit c4476c
					#mul_add(rp[0],ap[0],w,c1);
Packit c4476c
	$LD	r8,`0*$BNSZ`(r4)
Packit c4476c
	$LD	r11,`0*$BNSZ`(r3)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	addc	r9,r9,r12		#r12 is carry.
Packit c4476c
	addze	r10,r10
Packit c4476c
	addc	r9,r9,r11
Packit c4476c
	#addze	r10,r10
Packit c4476c
					#the above instruction addze
Packit c4476c
					#is NOT needed. Carry will NOT
Packit c4476c
					#be ignored. It's not affected
Packit c4476c
					#by multiply and will be collected
Packit c4476c
					#in the next spin
Packit c4476c
	$ST	r9,`0*$BNSZ`(r3)
Packit c4476c
Packit c4476c
					#mul_add(rp[1],ap[1],w,c1);
Packit c4476c
	$LD	r8,`1*$BNSZ`(r4)
Packit c4476c
	$LD	r9,`1*$BNSZ`(r3)
Packit c4476c
	$UMULL	r11,r6,r8
Packit c4476c
	$UMULH  r12,r6,r8
Packit c4476c
	adde	r11,r11,r10		#r10 is carry.
					# (adde also adds the CA bit left
					# pending by the previous addc.)
Packit c4476c
	addze	r12,r12
Packit c4476c
	addc	r11,r11,r9
Packit c4476c
	#addze	r12,r12
Packit c4476c
	$ST	r11,`1*$BNSZ`(r3)
Packit c4476c
Packit c4476c
					#mul_add(rp[2],ap[2],w,c1);
Packit c4476c
	$LD	r8,`2*$BNSZ`(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$LD	r11,`2*$BNSZ`(r3)
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	adde	r9,r9,r12
Packit c4476c
	addze	r10,r10
Packit c4476c
	addc	r9,r9,r11
Packit c4476c
	#addze	r10,r10
Packit c4476c
	$ST	r9,`2*$BNSZ`(r3)
Packit c4476c
Packit c4476c
					#mul_add(rp[3],ap[3],w,c1);
Packit c4476c
	$LD	r8,`3*$BNSZ`(r4)
Packit c4476c
	$UMULL	r11,r6,r8
Packit c4476c
	$LD	r9,`3*$BNSZ`(r3)
Packit c4476c
	$UMULH  r12,r6,r8
Packit c4476c
	adde	r11,r11,r10
Packit c4476c
	addze	r12,r12
Packit c4476c
	addc	r11,r11,r9
Packit c4476c
	addze	r12,r12			# last word of the pass: fold the
					# pending carry into r12 now.
Packit c4476c
	$ST	r11,`3*$BNSZ`(r3)
Packit c4476c
	addi	r3,r3,`4*$BNSZ`		# advance rp by four words
Packit c4476c
	addi	r4,r4,`4*$BNSZ`		# advance ap by four words
Packit c4476c
	bdnz	Lppcasm_maw_mainloop
Packit c4476c
Packit c4476c
Lppcasm_maw_leftover:
Packit c4476c
	andi.	r5,r5,0x3		# r5 = num & 3 = words still to do
Packit c4476c
	beq	Lppcasm_maw_adios
Packit c4476c
					# Bias both pointers down by one word;
					# the leftover code uses displacement
					# BNSZ with (presumably) update-form
					# loads, then stores at offset 0.
	addi	r3,r3,-$BNSZ
Packit c4476c
	addi	r4,r4,-$BNSZ
Packit c4476c
					#mul_add(rp[0],ap[0],w,c1);
Packit c4476c
	mtctr	r5			# CTR = leftover count (1..3 here)
Packit c4476c
	$LDU	r8,$BNSZ(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	$LDU	r11,$BNSZ(r3)
Packit c4476c
	addc	r9,r9,r11
Packit c4476c
	addze	r10,r10
Packit c4476c
	addc	r9,r9,r12
Packit c4476c
	addze	r12,r10			# r12 = carry for the next word
Packit c4476c
	$ST	r9,0(r3)
Packit c4476c
Packit c4476c
	bdz	Lppcasm_maw_adios	# decrement CTR, done when it reaches 0
Packit c4476c
					#mul_add(rp[1],ap[1],w,c1);
Packit c4476c
	$LDU	r8,$BNSZ(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	$LDU	r11,$BNSZ(r3)
Packit c4476c
	addc	r9,r9,r11
Packit c4476c
	addze	r10,r10
Packit c4476c
	addc	r9,r9,r12
Packit c4476c
	addze	r12,r10			# r12 = carry for the next word
Packit c4476c
	$ST	r9,0(r3)
Packit c4476c
Packit c4476c
	bdz	Lppcasm_maw_adios	# decrement CTR, done when it reaches 0
Packit c4476c
					#mul_add(rp[2],ap[2],w,c1);
Packit c4476c
	$LDU	r8,$BNSZ(r4)
Packit c4476c
	$UMULL	r9,r6,r8
Packit c4476c
	$UMULH  r10,r6,r8
Packit c4476c
	$LDU	r11,$BNSZ(r3)
Packit c4476c
	addc	r9,r9,r11
Packit c4476c
	addze	r10,r10
Packit c4476c
	addc	r9,r9,r12
Packit c4476c
	addze	r12,r10			# r12 = final carry
Packit c4476c
	$ST	r9,0(r3)
Packit c4476c
Packit c4476c
Lppcasm_maw_adios:
Packit c4476c
	addi	r3,r12,0		# return the carry word
Packit c4476c
	blr
Packit c4476c
					# NOTE(review): the data words below
					# look like a traceback table; the 7th
					# byte equals the argument count (4)
					# -- confirm the format.
	.long	0
Packit c4476c
	.byte	0,12,0x14,0,0,0,4,0
Packit c4476c
	.long	0
Packit c4476c
.size	.bn_mul_add_words,.-.bn_mul_add_words
Packit c4476c
	.align	4
Packit c4476c
EOF
Packit c4476c
# Expand the template: every backquote-delimited span in $data is replaced
# by the value of the Perl expression it contains (string eval), which is
# how the backquoted arithmetic in the assembly text above becomes a
# literal number. $data is presumably the text assembled earlier in this
# file -- its assignment is outside this section.
$data =~ s/\`([^\`]*)\`/eval $1/gem;
Packit c4476c
# Write the expanded text to the default output handle.
print $data;
Packit c4476c
# Close STDOUT explicitly and abort with the system error text if that fails.
close STDOUT or die "error closing STDOUT: $!";