Blame crypto/sha/asm/sha512-armv4.pl

Packit c4476c
#! /usr/bin/env perl
Packit c4476c
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
Packit c4476c
Packit c4476c
# ====================================================================
Packit c4476c
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
Packit c4476c
# project. The module is, however, dual licensed under OpenSSL and
Packit c4476c
# CRYPTOGAMS licenses depending on where you obtain it. For further
Packit c4476c
# details see http://www.openssl.org/~appro/cryptogams/.
Packit c4476c
#
Packit c4476c
# Permission to use under GPL terms is granted.
Packit c4476c
# ====================================================================
Packit c4476c
Packit c4476c
# SHA512 block procedure for ARMv4. September 2007.
Packit c4476c
Packit c4476c
# This code is ~4.5 (four and a half) times faster than code generated
Packit c4476c
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
Packit c4476c
# Xscale PXA250 core].
Packit c4476c
#
Packit c4476c
# July 2010.
Packit c4476c
#
Packit c4476c
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
Packit c4476c
# Cortex A8 core and ~40 cycles per processed byte.
Packit c4476c
Packit c4476c
# February 2011.
Packit c4476c
#
Packit c4476c
# Profiler-assisted and platform-specific optimization resulted in 7%
Packit c4476c
# improvement on Cortex A8 core and ~38 cycles per byte.
Packit c4476c
Packit c4476c
# March 2011.
Packit c4476c
#
Packit c4476c
# Add NEON implementation. On Cortex A8 it was measured to process
Packit c4476c
# one byte in 23.3 cycles or ~60% faster than integer-only code.
Packit c4476c
Packit c4476c
# August 2012.
Packit c4476c
#
Packit c4476c
# Improve NEON performance by 12% on Snapdragon S4. In absolute
Packit c4476c
# terms it's 22.6 cycles per byte, which is a disappointing result.
Packit c4476c
# Technical writers asserted that 3-way S4 pipeline can sustain
Packit c4476c
# multiple NEON instructions per cycle, but dual NEON issue could
Packit c4476c
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
Packit c4476c
# for further details. On a side note Cortex-A15 processes one byte in
Packit c4476c
# 16 cycles.
Packit c4476c
Packit c4476c
# Byte order [in]dependence. =========================================
Packit c4476c
#
Packit c4476c
# Originally caller was expected to maintain specific *dword* order in
Packit c4476c
# h[0-7], namely with most significant dword at *lower* address, which
Packit c4476c
# was reflected in below two parameters as 0 and 4. Now caller is
Packit c4476c
# expected to maintain native byte order for whole 64-bit values.
Packit c4476c
# $hi/$lo are emitted verbatim as the symbols HI/LO; the generated
# prologue #defines them as 0 or 4 depending on __ARMEL__, so the C
# preprocessor picks the word order within each 64-bit value.
$hi="HI";
$lo="LO";
# ====================================================================

# Command line is "[flavour] ... output".  If the first argument already
# looks like a file name (word characters, a dot, an extension) it is the
# output and there is no flavour; otherwise keep shifting arguments until
# one looks like a file name (or the arguments run out).
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl next to this script or in ../../perlasm and
    # pipe everything printed to STDOUT through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
} else {
    # No translator: write the raw perlasm output straight to the file.
    # Fail here rather than at the final close if it cannot be opened.
    open STDOUT,">$output"
	or die "can't open '$output' for writing: $!";
}
Packit c4476c
Packit c4476c
# Register allocation for the integer-only code path.  The first three
# hold the function arguments.
$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

# 64-bit quantities are kept as lo/hi pairs of 32-bit registers.
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

# Byte offsets of the eight 64-bit state words a..h, used both for the
# context block and for the stack frame; X[] starts right after them.
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
Packit c4476c
Packit c4476c
# Append one SHA-512 round of the integer-only code path to $code.
#
# On entry of the emitted code Tlo/Thi hold the message word for this
# round, Alo/Ahi hold a and Elo/Ehi hold e; b, c, d, f, g, h are read from
# the stack frame.  The emitted code
#   - stores T at X[] and the current a and e in the frame,
#   - accumulates T += Sigma1(e) + h + Ch(e,f,g) + K[i],
#   - computes d += T (left in Elo/Ehi) and Maj(a,b,c) + Sigma0(a) + T
#     (left in Alo/Ahi),
#   - lowers sp by 8, so that the fixed frame offsets address the state
#     shifted by one slot in the next round,
#   - advances $Ktbl by 8.
#
# $magic is the low byte of K[i].lo that marks the last round of the
# current range: when it matches, bit 0 of $Ktbl is set.  The final
# "tst $Ktbl,#1" leaves that flag in the condition codes (the following
# "add" does not update flags), so the caller branches on eq/ne right
# after this sequence.
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#ifdef	__thumb2__
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
Packit c4476c
# Start of the generated assembly (note "=": this resets $code).
# Emitted here: the preprocessor prologue, the K512 round-constant table
# and the integer-only sha512_block_data_order entry point.  K512 is 640
# bytes and is followed by 32 bytes of padding/armcap offset, which is why
# the function computes the table address as "entry - 672".
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code	32
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	# Rounds 0..15: 0x94 is the low byte of K[15].lo (0xcf692694), so the
	# end-of-range flag is raised on the 16th round.
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	# Rounds 16..79: 0x17 is the low byte of K[79].lo (0x4a475817).
	&BODY_00_15(0x17);
$code.=<<___;
#ifdef	__thumb2__
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___
Packit c4476c
Packit c4476c
# NEON code path.  The bare block scopes the rotation tables and the
# NEON-specific register names so that they do not clash with the names
# used by the integer-only path above.
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

# Append one NEON round to $code.
#
#   $i                 - round index; selects the X[] slot ($i%16) and
#                        which optional parts are emitted
#   ($a..$h)           - names of the d-registers currently holding the
#                        working variables a..h
#
# The first three Sigma1 shifts are emitted only for $i<16 or odd $i; for
# even $i>=16 the caller (NEON_16_79) has already emitted them interleaved
# with the message schedule.  The "#if $i<16" / "#if $i>0" lines carry
# the interpolated round number and are resolved by the C preprocessor
# when the generated file is assembled.  Maj(a,b,c) is left in $Maj and
# added to the new a at the start of the following round ("h+=Maj from
# the past").
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

# Append one NEON round with message-schedule update to $code.
#
# Odd rounds are delegated to NEON_00_15 unchanged.  Even rounds first
# update two schedule words at once (X[] viewed as q-registers, hence the
# index is halved), with the first Sigma1 shifts of the round interleaved,
# and then call NEON_00_15 with the restored round index (2*$i).
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
# Rounds 0..15 are fully unrolled; @V is rotated after each round so that
# the register names passed as a..h follow the working variables.
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
# Rounds 16..79: 16 unrolled rounds executed 4 times by the $cnt loop.
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
Packit c4476c
# Trailer: identification string and, outside the kernel, the common
# symbol for the capability word consulted by the NEON dispatch.
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Post-process the generated text:
#  1. evaluate every `...` expression (offset arithmetic) in Perl;
#  2. replace literal "bx lr" with its encoding;
#  3. only then expand "ret" to "bx lr" - the order matters, otherwise the
#     NEON return would be turned into a .word as well.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

# Copy this script's leading comment block to the output as assembler
# comments ('#' -> '@'), skipping the shebang line and stopping at the
# first line that is neither a comment nor empty.  This stays best-effort:
# if the script cannot be re-opened the header is simply omitted.
if (open(my $self, '<', $0)) {
	while (my $line = <$self>) {
		next if ($line =~ /^#!/);
		last unless ($line =~ s/^#/@/ or $line =~ /^$/);
		print $line;
	}
	close $self;
}

print $code;
close STDOUT or die "error closing STDOUT: $!"; # enforce flush