#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%    21.8        14.1
# Cortex-A8		10.5(*)/+160%   13.9        6.35
# Cortex-A9		12.9(**)/+110%  14.3        6.50
# Cortex-A15		11.0/+40%       16.0        5.00
# Snapdragon S4		11.5/+125%      13.6        4.90
#
# (*)	most "favourable" result for aligned data on little-endian
#	processor, result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

# Command-line handling.
#
# The first argument is the perlasm "flavour" unless it already looks like
# a file name (something with an extension), in which case it is taken as
# the output file and no flavour is used.  Otherwise the remaining
# arguments are scanned until one looks like a file name; that one becomes
# $output.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl next to this script or in ../../perlasm and
    # pipe everything printed on STDOUT through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Quote the output name so paths with spaces survive the shell, and
    # fail loudly if the translator cannot be started.
    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # No translator: write straight to the output file.  Without an
    # output argument keep the inherited STDOUT instead of attempting
    # (and failing) to open an empty file name, which would close STDOUT.
    $output and (open STDOUT,">",$output or die "can't open $output: $!");
}

# Catch-all for calls to undefined subs such as &add(...) or
# &vadd_i32(...): the sub name becomes the mnemonic and the arguments
# become its comma-separated operands, appended to $code as one line.
#
#   - only the first '_' in the name is turned into '.', so
#     vadd_i32 -> vadd.i32 and vrev32_16 -> vrev32.16;
#   - the last argument is prefixed with '#' when it is purely numeric,
#     i.e. it is emitted as an immediate.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Register maps for the integer-only code path.
#
# @x is indexed by ChaCha state word number.  Words 0..7 map to r0..r7,
# words 12 and 14 to r12 and r14.  The remaining entries are "rx": those
# words are kept on the stack (see the comment in ROUND) and "rx" is
# presumably just a placeholder that must never reach an instruction.
#
# @t is r8..r11: ROUND uses @t[0..1] for the current pair of 'c' words and
# @t[2] for one 'd' word; @t[3] serves as the round counter and length.
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

# ROUND(a0,b0,c0,d0) - generate one full ChaCha round for the integer path.
#
# The arguments are the state-word indices of the first quarter-round.
# The indices of the other three quarter-rounds are derived by stepping
# each index to the next one within its group of four,
# ($_&~3)+(($_+1)&3).  Quarter-rounds are interleaved in pairs (0 with 1,
# then 2 with 3).
#
# Returns a list of strings, each a perlasm call such as
# "&add (r0,r0,r4)"; the caller evals them one by one, which routes them
# through AUTOLOAD and appends the instructions to $code.
#
# Rotations are not applied eagerly: 'b' and 'd' are kept pre-rotated with
# "mov reg,reg,ror#n" and the matching rotate is folded into the shifter
# operand of the following eor.
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
# One 'd' of the first pair lives in its own register, the other in @t[2];
# which is which depends on the parity of $d0.
my ($xd,$xd_) = $odd ? ($t[2],$x[$d1]) : ($x[$d0],$t[2]);
my @ret;

	# Consider order in which variables are addressed by their
	# index:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# 'a', 'b' are permanently allocated in registers, @x[0..7],
	# while 'c's and pair of 'd's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end. If you observe 'd' column, you'll
	# notice that 15 and 13 are reused in next pair of rounds.
	# This is why these two are chosen for offloading to memory,
	# to make loads count more.
							push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
	# The 'd' word held in @t[2] is spilled and the one needed by the
	# second pair of quarter-rounds is loaded in its place.
							push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
							push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
							push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
							push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);

	# The other 'd' of the second pair sits in its own register.
	$xd=$x[$d2]					if (!$odd);
	$xd_=$x[$d3]					if ($odd);
							push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);

	@ret;
}

# Emit the module prologue: the .Lsigma/.Lone constants, the
# OPENSSL_armcap_P offset word, the ChaCha20_ctr32 entry point (including
# the dispatch to .LChaCha20_neon when __ARM_MAX_ARCH__>=7), the stack
# set-up for the integer-only path and the head of the .Loop round loop.
# Nothing but assembly may appear between the heredoc markers: every line
# is emitted verbatim, and the code relies on the exact distances between
# .Lsigma, the capability word and .LChaCha20_ctr32.
$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12,  [sp,#4*(32+1)]	@ save inp
	str	r14,  [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	# Body of .Loop: one column round followed by one diagonal round.
	# The loop counter is preset to 10 before .Loop, giving the 20 rounds
	# of ChaCha20.  Each generated string is a perlasm call; abort if any
	# of them fails to evaluate instead of silently dropping an
	# instruction from the cipher.
	foreach my $insn (&ROUND(0, 4, 8,12)) { eval $insn; die $@ if $@; }
	foreach my $insn (&ROUND(0, 5,10,15)) { eval $insn; die $@ if $@; }
# Emit the end of the round loop and the word-wise output path: the second
# half of the state is stored, key material is added back block by block,
# the result is xor-ed with the input (only when at least 64 bytes remain)
# and stored, the block counter is bumped and control returns to
# .Loop_outer.  For __ARM_ARCH__<7 misaligned buffers branch to
# .Lunaligned, whose body is generated by the loop that follows.
# Every line between the heredoc markers is emitted verbatim.
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
# Generate the byte-wise (unaligned, endian-neutral) output path, four
# state words per iteration.  $i is the index of the first state word of
# the group and $j=$i&7 the matching index into @x, because the second
# half of the state is reloaded into the same eight registers (the ldmia
# emitted for $i==8, whose base address is prepared at $i==4).
# For $i==12 the block counter is incremented and saved as well.
# The input byte registers are zeroed instead of loaded when fewer than
# 64 bytes remain ("lo"), so that the plain key stream gets stored.
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
# Emit the end of the byte-wise path (length update and loop-back), the
# .Ltail/.Loop_tail code that xors the remaining input bytes, one at a
# time, with the key stream block kept at the bottom of the stack, and the
# function epilogue.  Every line between the heredoc markers is emitted
# verbatim.
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
# NEON register names for the vector path: three (a,b,c,d) sets held in
# q0..q11 plus four temporaries in q12..q15.
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

# NEONROUND(a,b,c,d,t,odd) - generate one ChaCha round on a 4x32-bit
# vector state.
#
#   a,b,c,d - names of the registers holding the four state rows
#   t       - name of a scratch register
#   odd     - selects the direction of the closing lane rotation; it is
#             interpolated into a "?:" expression that is evaluated when
#             the string is eval'ed, so pass a literal 0 or 1 (an empty
#             value would produce an invalid expression)
#
# Returns a list of strings, each a perlasm call for the caller to eval.
# The rotate by 16 is done with vrev32.16; the other rotates are built
# from a vshr.u32 / vsli.32 pair through the scratch register.  The final
# three vext.8 calls rotate rows b, c and d by whole 32-bit lanes.
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}

# Emit the head of ChaCha20_neon (only assembled when __ARM_MAX_ARCH__>=7):
# prologue, stack-frame setup and the entry into the round loop.  The
# heredoc stops right after the "subs" that opens .Loop_neon; the round
# body is appended by the Perl code that follows.
#
# Stack frame as set up by the instructions below (offsets in 32-bit words
# from sp once the frame is complete):
#    0..15   copy of the input state: sigma | key | counter | nonce
#   16..     working area; words 16+0, 16+2 and 16+4 hold the NEON
#            constants "one", "two" and "four", and 16+10/16+11 the
#            off-loaded "@x[10]"/"@x[11]"
#   32..35   r0-r3 as passed in (out, inp, len, key)
#   36..51   callee-saved d8-d15
#   52..     r0-r2,r4-r11,lr pushed on entry
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12,  [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14,  [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	# Body of .Loop_neon: one double round over four blocks at once.
	# Threads 0-2 are the three NEON blocks; thread 3 is a fourth block
	# produced by ROUND (defined earlier in this file, not visible here --
	# presumably the scalar-register round generator).
	#
	# Every element of a thread is a string holding one instruction
	# generator call.  The loops below eval them round-robin so the
	# streams end up interleaved: for each NEON instruction of thread 0,
	# one instruction of threads 1 and 2 and three of thread 3 are
	# evaluated.  NEONROUND yields 18 strings, so 54 entries of @thread3
	# are consumed per half (assumes ROUND returns at least that many --
	# TODO confirm).

	# First half: state indices (0,4,8,12) for the scalar block, $odd=0.
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		# a bare "eval" evaluates $_, i.e. the current @thread0 entry
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	# Second half: state indices (0,5,10,15) for the scalar block, $odd=1.
	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
# Emit the remainder of ChaCha20_neon, i.e. everything after the round body:
#
#   * close .Loop_neon, then add the saved input state back into the three
#     NEON blocks ("accumulate key material") and re-derive the per-block
#     counters;
#   * if len >= 64*4: xor 256 bytes of input with the four blocks, store
#     them, advance the counter by four and go back to .Loop_neon_outer
#     while more than 256 bytes were left, otherwise finish;
#   * .Lbreak_neon (reached from .Loop_neon_outer when len <= 64*2):
#     restore d8-d15, copy the state into the integer-only stack frame,
#     which lives 4*(20) bytes higher, and continue at .Loop;
#   * .Ltail_neon: fewer than 256 bytes left -- process as many whole
#     64-byte blocks as fit, spill the next keystream block to the stack
#     and xor the remaining bytes one at a time in .Loop_tail_neon;
#   * .Ldone_neon: epilogue, restoring d8-d15 and r4-r11.
#
# The "qN#lo"/"qN#hi" operands are rewritten to d-register names by the
# output loop at the end of this file.
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	 mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	 vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	 veor		$t0#hi,$t0#hi,$t0#hi
	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	 vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	 vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	 vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	 vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	 add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	 sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}
Packit c4476c
Packit c4476c
# Final pass: post-process the accumulated assembly text line by line and
# write it to STDOUT.
foreach (split("\n",$code)) {
	# Replace every `...` span with the result of evaluating its
	# contents as a Perl expression.
	s/\`([^\`]*)\`/eval $1/geo;

	# Turn the "qN#lo" / "qN#hi" notation used in the heredocs into the
	# name of the corresponding 64-bit half: d(2N) for lo, d(2N+1) for hi.
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
# Buffered write errors only surface at close, so make them fatal.
close STDOUT or die "error closing STDOUT: $!";