#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It likewise supports both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# April 2019
#
# Key to performance of parallelize-able modes is round instruction
# interleaving. But which factor to use? There is optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing interleave factor doesn't pay off. While on cons
# side we have code size increase and resource waste on platforms for
# which interleave factor is too high. In other words you want it to
# be just right. So far interleave factor of 3x was serving well all
# platforms. But for ThunderX2 optimal interleave factor was measured
# to be 5x...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first in the directory this script
# was invoked from, then under ../../perlasm/ relative to that directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on STDOUT is a pipe into the translator, which is run with the
# current perl binary ($^X) and handed $flavour and $output as arguments.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Prefix of the symbol names defined by the heredocs below.
$prefix="aes_v8";

# Data directive interpolated into the INST() macro: "DCB" for flavours
# matching /win/, ".byte" otherwise.
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

# Start of the generated assembly.  Everything that follows in $code sits
# inside the "#if __ARM_MAX_ARCH__>=7" opened here.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# Flavours matching /64/ select armv8-a+crypto and open the text section.
$code.=".arch	armv8-a+crypto\n.text\n"		if ($flavour =~ /64/);
# All other flavours select armv7-a with NEON and define INST(), which emits
# its four arguments through $_byte (reordered, with 0xc OR-ed into the
# fourth argument, when __thumb2__ is defined).
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
#ifdef	__thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,d
#endif

.text
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Key schedule setup: ${prefix}_set_encrypt_key and ${prefix}_set_decrypt_key.
#
# Integer registers: $inp (x0) is the user key pointer, $bits (w1) the key
# length in bits, $out (x2) the schedule pointer; $ptr (x3) doubles as the
# return-value scratch and as the pointer into .Lrcon.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON registers: q0-q6 for /64/ flavours, q0-q3 and q8-q10 otherwise.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Constant table and entry point.  .Lenc_key is a local alias of the entry
# point that set_decrypt_key branches to.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument checks: the return value is -1 when $inp or $out is zero, and -2
# when $bits is below 128, above 256, or has any of its low six bits set.
# On success the 128-, 192- or 256-bit path is taken, the round count
# (10, 12 or 14) is stored through $out at .Ldone, and 0 is returned.
# The backtick expression near the end is not evaluated here; it is left in
# $code as text.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
# set_decrypt_key prologue: /64/ flavours emit paciasp as a raw .inst word
# and save x29/x30; other flavours push r4 and lr.
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Build the encryption schedule via .Lenc_key and bail out on a non-zero
# result.  Otherwise walk the schedule from both ends towards the middle,
# swapping round keys: the outermost pair is swapped as is, every other key
# (including the middle one) is passed through aesimc.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block($dir) appends the single-block routine ${prefix}_${dir}crypt
# to $code.
#
#   $dir - "en" selects the aese/aesmc instruction pair; any other value
#          (the file only ever passes "de") selects aesd/aesimc.
#
# The emitted routine takes the input block pointer in x0, the output block
# pointer in x1 and the key schedule pointer in x2, and reads the round
# count from offset 240 of the schedule.
#
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" was wrong and only worked because callers used "&".
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
{{{
# ${prefix}_cbc_encrypt: CBC encryption and decryption in one routine.
#
# Integer registers: x0 input pointer, x1 output pointer, x2 byte length,
# x3 key schedule pointer, x4 IV pointer, w5 direction flag (zero selects
# the decrypt path).  $rounds reuses the register of $enc once the flag has
# been compared.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

# Aliases used by the one-block-at-a-time encrypt path.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Pointers to round keys 4..7; $key7 reuses the register of $key itself.
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
# Common setup followed by the encrypt paths.  Lengths below 16 branch
# straight to .Lcbc_abort; the length is rounded down to a multiple of 16.
# Encryption has a general path (.Loop_cbc_enc) and a dedicated one for
# 128-bit keys (.Loop_cbc_enc128).
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
# Decrypt path.  Three blocks are processed per iteration on every flavour;
# /64/ flavours additionally get a five-block loop and move the third lane
# to q16-q23 together with the two extra lanes.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
___
# Five-block interleaved decrypt loop, emitted for /64/ flavours only.
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_cbc_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds
	vorr	$in3,$dat3,$dat3
	vorr	$in4,$dat4,$dat4

.Loop5x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	 cmp	$len,#0x40		// because .Lcbc_tail4x
	 sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	 csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	 mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	 add	x6,$len,#0x60		// because .Lcbc_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	 veor	$tmp0,$ivec,$rndlast
	aesd	$dat0,q15
	 veor	$tmp1,$in0,$rndlast
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	 veor	$tmp2,$in1,$rndlast
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	 veor	$tmp3,$in2,$rndlast
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	 veor	$tmp4,$in3,$rndlast
	 vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	 vorr	$ivec,$in4,$in4
	 vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lcbc_tail4x
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	 vorr	$dat0,$in0,$in0
	veor	$tmp1,$tmp1,$dat1
	 vorr	$dat1,$in1,$in1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$dat2,$in2,$in2
	veor	$tmp3,$tmp3,$dat3
	 vorr	$dat3,$in3,$in3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	 mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_cbc_dec

	add	$len,$len,#0x50
	cbz	$len,.Lcbc_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$in0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$in1,$in3,$in3
	vorr	$dat2,$in4,$in4
	vorr	$in2,$in4,$in4
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	veor	$tmp1,$tmp0,$dat1
	veor	$tmp2,$tmp2,$dat2
	veor	$tmp3,$tmp3,$dat3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lcbc_done
.align	4
___
# Three-block interleaved decrypt loop and the one/two-block tail, emitted
# for every flavour.  .Lcbc_done writes the final IV back through $ivp.
$code.=<<___;
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
# Epilogue: undo whichever prologue was emitted for this flavour.
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
Packit c4476c
{{{
Packit c4476c
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
Packit c4476c
my ($rounds,$cnt,$key_)=("w5","w6","x7");
Packit c4476c
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
Packit c4476c
my $step="x12";		# aliases with $tctr2
Packit c4476c
Packit c4476c
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
Packit c4476c
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
Packit c4476c
Packit c4476c
# used only in 64-bit mode...
Packit c4476c
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
Packit c4476c
Packit c4476c
my ($dat,$tmp)=($dat0,$tmp0);
Packit c4476c
Packit c4476c
### q8-q15	preloaded key schedule
Packit c4476c
Packit c4476c
$code.=<<___;
Packit c4476c
.globl	${prefix}_ctr32_encrypt_blocks
Packit c4476c
.type	${prefix}_ctr32_encrypt_blocks,%function
Packit c4476c
.align	5
Packit c4476c
${prefix}_ctr32_encrypt_blocks:
Packit c4476c
___
Packit c4476c
$code.=<<___	if ($flavour =~ /64/);
Packit c4476c
	stp		x29,x30,[sp,#-16]!
Packit c4476c
	add		x29,sp,#0
Packit c4476c
___
Packit c4476c
$code.=<<___	if ($flavour !~ /64/);
Packit c4476c
	mov		ip,sp
Packit c4476c
	stmdb		sp!,{r4-r10,lr}
Packit c4476c
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
Packit c4476c
	ldr		r4, [ip]		@ load remaining arg
Packit c4476c
___
Packit c4476c
# Common setup for both flavours:
#  - load the round count from offset 240 of the key structure and the
#    counter block from $ivp (its last 32-bit word also goes into $ctr);
#  - preload the first two round keys into q8-q9 and the final five into
#    q12-q15 and $rndlast, leaving $key_ pointing at the third round key;
#  - build the second counter block in $dat1 and keep a copy of the
#    counter block in $ivec;
#  - with two or fewer blocks, branch to .Lctr32_tail ($step is zeroed
#    when $len is below 2); otherwise build the third counter block in
#    $dat2 and bias $len by 3 for the interleaved loops.
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
___
Packit c4476c
# 64-bit flavour only: five-block interleaved CTR path.
#
# On entry $len has already been biased by 3; if fewer than two further
# blocks are available control goes straight to .Loop3x_ctr32.  Otherwise
# two more counter blocks are built in $dat3/$dat4 (using w13/w14 as
# scratch) and $len is biased by another 2.
#
# Each .Loop5x_ctr32 pass runs the AES rounds on $dat0-$dat4, streaming the
# middle round keys through q8/q9 via $key_, and computes the next five
# counter values while the last rounds execute.  The last round key is
# folded into the loaded input ($in0-$in4) before the final XOR with the
# cipher state, and the results are stored to $out.
#
# When fewer than five blocks remain, $len and $ctr are un-biased and
# control continues to .Lctr32_tail (two or fewer blocks) or falls through
# to the 3x loop emitted next.
$code.=<<___	if ($flavour =~ /64/);
	cmp		$len,#2
	b.lo		.Loop3x_ctr32

	add		w13,$ctr,#1
	add		w14,$ctr,#2
	vorr		$dat3,$dat0,$dat0
	rev		w13,w13
	vorr		$dat4,$dat0,$dat0
	rev		w14,w14
	vmov.32		${dat3}[3],w13
	sub		$len,$len,#2		// bias
	vmov.32		${dat4}[3],w14
	add		$ctr,$ctr,#2
	b		.Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop5x_ctr32

	mov		$key_,$key
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32	 	{q8},[$key_],#16	// re-pre-load rndkey[0]

	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32	 	{q9},[$key_],#16	// re-pre-load rndkey[1]

	aese		$dat0,q12
	aesmc		$dat0,$dat0
	 add		$tctr0,$ctr,#1
	 add		$tctr1,$ctr,#2
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 add		$tctr2,$ctr,#3
	 add		w13,$ctr,#4
	aese		$dat2,q12
	aesmc		$dat2,$dat2
	 add		w14,$ctr,#5
	 rev		$tctr0,$tctr0
	aese		$dat3,q12
	aesmc		$dat3,$dat3
	 rev		$tctr1,$tctr1
	 rev		$tctr2,$tctr2
	aese		$dat4,q12
	aesmc		$dat4,$dat4
	 rev		w13,w13
	 rev		w14,w14

	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	aese		$dat2,q13
	aesmc		$dat2,$dat2
	aese		$dat3,q13
	aesmc		$dat3,$dat3
	aese		$dat4,q13
	aesmc		$dat4,$dat4

	aese		$dat0,q14
	aesmc		$dat0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp],#16
	aese		$dat2,q14
	aesmc		$dat2,$dat2
	 vld1.8		{$in2},[$inp],#16
	aese		$dat3,q14
	aesmc		$dat3,$dat3
	 vld1.8		{$in3},[$inp],#16
	aese		$dat4,q14
	aesmc		$dat4,$dat4
	 vld1.8		{$in4},[$inp],#16

	aese		$dat0,q15
	 veor		$in0,$in0,$rndlast
	aese		$dat1,q15
	 veor		$in1,$in1,$rndlast
	aese		$dat2,q15
	 veor		$in2,$in2,$rndlast
	aese		$dat3,q15
	 veor		$in3,$in3,$rndlast
	aese		$dat4,q15
	 veor		$in4,$in4,$rndlast

	veor		$in0,$in0,$dat0
	 vorr		$dat0,$ivec,$ivec
	veor		$in1,$in1,$dat1
	 vorr		$dat1,$ivec,$ivec
	veor		$in2,$in2,$dat2
	 vorr		$dat2,$ivec,$ivec
	veor		$in3,$in3,$dat3
	 vorr		$dat3,$ivec,$ivec
	veor		$in4,$in4,$dat4
	 vorr		$dat4,$ivec,$ivec

	vst1.8		{$in0},[$out],#16
	 vmov.32	${dat0}[3],$tctr0
	vst1.8		{$in1},[$out],#16
	 vmov.32	${dat1}[3],$tctr1
	vst1.8		{$in2},[$out],#16
	 vmov.32	${dat2}[3],$tctr2
	vst1.8		{$in3},[$out],#16
	 vmov.32	${dat3}[3],w13
	vst1.8		{$in4},[$out],#16
	 vmov.32	${dat4}[3],w14

	mov		$cnt,$rounds
	cbz		$len,.Lctr32_done

	add		$ctr,$ctr,#5
	subs		$len,$len,#5
	b.hs		.Loop5x_ctr32

	add		$len,$len,#5
	sub		$ctr,$ctr,#5

	cmp		$len,#2
	mov		$step,#16
	cclr		$step,lo
	b.ls		.Lctr32_tail

	sub		$len,$len,#3		// bias
	add		$ctr,$ctr,#3
___
Packit c4476c
# Three-block interleaved CTR loop followed by the one/two-block tail
# (both flavours).
#
# .Loop3x_ctr32 runs the AES rounds on $dat0-$dat2, streaming the middle
# round keys through q8/q9 via $key_.  During the final rounds the cipher
# state moves to $tmp0-$tmp2 so that $dat0-$dat2 can be reloaded from
# $ivec and stamped with the next three counter values.  The last round
# key is folded into the loaded input before the final XOR, and three
# blocks are stored per pass.
#
# .Lctr32_tail encrypts the counter blocks in $dat0/$dat1.  $step is 0
# when a single block remains, so the second load re-reads the same input
# block, and the second store is skipped in that case.
$code.=<<___;
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
Packit c4476c
# Function epilogue, 32-bit flavour: restore d8-d15 and r4-r10, returning
# by popping the saved lr straight into pc.
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
# Function epilogue, 64-bit flavour: reload x29, release the 16-byte frame
# and return.
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
# Record the symbol size for the function emitted above.
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
Packit c4476c
}}}
Packit c4476c
# Terminate a preprocessor conditional in the generated assembly
# (NOTE(review): the matching #if is emitted earlier in the file, outside
# this section).
$code.=<<___;
#endif
___
Packit c4476c
########################################
Packit c4476c
if ($flavour =~ /64/) {			######## 64-bit code
Packit c4476c
    # Base opcodes of the AArch64 AES instructions; register numbers are
    # OR-ed in by unaes() below.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # unaes($mnemonic, $arg): hand-encode an AES instruction as a ".inst"
    # directive.  $arg is the operand text ("<q|v>N...,<q|v>M...").  The
    # first register number goes into the low bits, the second is shifted
    # left by 5.  The original mnemonic and operands are appended as a
    # trailing comment.  Returns a false value when $arg does not look like
    # a two-register operand list.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
Packit c4476c
Packit c4476c
    # Post-process the accumulated $code line by line for the 64-bit
    # flavour and print the result: evaluate `...` expressions, rename
    # q-registers to v-registers, and rewrite 32-bit style mnemonics and
    # suffixes into their AArch64 spellings.
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	# q0-q7 keep their number, q8 and above are shifted up by 8
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	# At most one of the following rewrites is applied per line (the
	# "or" chain stops at the first substitution that succeeds).
	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
Packit c4476c
} else {				######## 32-bit code
Packit c4476c
    # Base opcodes of the 32-bit AES instructions; register numbers are
    # OR-ed in by unaes() below.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # unaes($mnemonic, $arg): hand-encode an AES instruction as an INST()
    # macro invocation listing the four instruction bytes, least
    # significant first.  $arg is the operand text ("<q|v>N...,<q|v>M...").
    # The low three bits and bit 3 of each register number are placed in
    # separate fields of the instruction word.  The original mnemonic and
    # operands are appended as a trailing comment.  Returns a false value
    # when $arg does not look like a two-register operand list.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
Packit c4476c
Packit c4476c
    # unvtbl($arg): split a q-register table lookup "qD,{qT},qI" into two
    # "vtbl.8" instructions, one per d-register half (d(2D)/d(2I), then
    # d(2D+1)/d(2I+1)), joined by "\n\t".  Returns a false value when $arg
    # does not have that shape.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
Packit c4476c
Packit c4476c
    # unvdup32($arg): rewrite "qD,qS[lane]" so that the source is the
    # d-register half holding that 32-bit lane: d(2S + lane/2)[lane & 1].
    # Returns a false value when $arg does not have that shape.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
Packit c4476c
Packit c4476c
    # unvmov32($arg): rewrite "qD[lane],<rest>" so that the destination is
    # the d-register half holding that 32-bit lane: d(2D + lane/2)[lane & 1].
    # Everything after the comma is passed through unchanged.  Returns a
    # false value when $arg does not have that shape.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
Packit c4476c
Packit c4476c
    # Post-process the accumulated $code line by line for the 32-bit
    # flavour and print the result: evaluate `...` expressions, rename
    # w/x and v registers to r and q, turn "//" comments into "@", and
    # rewrite 64-bit style addressing and mnemonics into their 32-bit
    # spellings.
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	# At most one of the following rewrites is applied per line (the
	# "or" chain stops at the first substitution that succeeds).
	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	# A conditional move "mov.<cond>" becomes "mov<cond>" preceded by an
	# "it <cond>" line.
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
Packit c4476c
}
Packit c4476c
Packit c4476c
# Close STDOUT explicitly and abort with the system error message if that
# fails, so that an error reported at close time is not silently ignored.
close STDOUT or die "error closing STDOUT: $!";