#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated by
# vendor compiler.

# Symbolic Alpha register names interpolated into the assembly templates
# below.  The generated output includes regdef.h, which resolves these
# names; the trailing "$n" notes give the matching register numbers.
$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

# loop() appends one copy of the shared multiplication body to $code.
#
# The emitted sequence walks the 16 bytes held in $Xlo and then $Xhi
# (extbl with byte index 7 down to 0 for each register).  Every byte is
# split into two 4-bit values that are scaled to 16-byte offsets into
# $Htbl, while the low 4 bits shifted out of $Zlo index the 8-byte
# entries of the table addressed by $rem_4bit.  The result is left in
# $Zhi/$Zlo.
#
# $N counts the invocations so that the .Looplo$N/.Loophi$N labels stay
# unique when the body is emitted more than once.  The here-doc text is
# emitted verbatim; instruction order is part of the scheduling and must
# not be rearranged.
{ my $N;
  sub loop() {

	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

# Start the generated file: pull in the register-name header, switch to
# .text and open gcm_gmult_4bit.  The entry code loads both 64-bit
# halves of Xi and calls picmeup (return address in $t0) before the
# shared loop body is appended.
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

	# Append the shared loop body produced by loop() to $code.
	&loop();

# Close gcm_gmult_4bit: byte-swap $Zlo and $Zhi (shift/zapnot pairs
# followed by an exchange of the 32-bit halves), store the swapped
# values back to Xi and return.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

# Registers that hold the current 16-byte input block in gcm_ghash_4bit;
# that routine's prologue spills s0/s1 to its stack frame before use.
$inhi="s0";
$inlo="s1";

# Open gcm_ghash_4bit.  The prologue builds a 32-byte frame holding ra,
# s0 and s1, issues the ldq_u loads for the first 16 input bytes, loads
# Xi and calls picmeup.  At .Louter the ldq_u pairs are merged with
# extql/extqh, $inp advances by 16, $len drops by 16 and the input block
# is xor-ed into $Xhi/$Xlo ahead of the shared loop body.
$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	# Append the shared loop body produced by loop() to $code.
	&loop();

# Finish gcm_ghash_4bit and emit the shared support code.
#
# The byte swap of $Zlo/$Zhi is interleaved with the ldq_u loads for the
# next input block; "beq $len,.Ldone" leaves the loop once $len reaches
# zero, otherwise control returns to .Louter.  .Ldone completes the
# swap, stores Xi, restores s0/s1 and pops the frame.
#
# picmeup places the address of the rem_4bit table in $rem_4bit: "br"
# captures the address of .Lpic, and the +12 displacement skips the
# three instructions that sit between .Lpic and the rem_4bit label.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
# The last command-line argument, when present, names the output file;
# without one the generated assembly goes to the inherited STDOUT.
# A failed open is fatal, so the real cause is reported instead of
# surfacing later as a close error.
$output=pop;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";