Blame crypto/camellia/asm/cmll-x86.pl

#! /usr/bin/env perl
# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#			AMD K8	Core2	PIII	P4
# -evp camellia-128-ecb	21.5	22.8	27.0	28.9
# + over gcc 3.4.6	+90/11% +70/10%	+53/4%	+160/64%
# + over icc 8.0	+48/19% +21/15%	+21/17%	+55/37%
#
# camellia-128-cbc	17.3	21.1	23.9	25.9
#
# 128-bit key setup	196	280	256	240	cycles/key
# + over gcc 3.4.6	+30/0%	+17/11%	+11/0%	+63/40%
# + over icc 8.0	+18/3%	+10/0%	+10/3%	+21/10%
#
# Pairs of numbers in "+" rows represent performance improvement over
# compiler generated position-independent code, PIC, and non-PIC
# respectively. PIC results are of greater relevance, as this module
# is position-independent, i.e. suitable for a shared library or PIE.
# Position independence "costs" one register, which is why compilers
# are so close with non-PIC results, they have an extra register to
# spare. CBC results are better than ECB ones thanks to "zero-copy"
# private _x86_* interface, and are ~30-40% better than with compiler
# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
# same CPU (where applicable).

# Locate the directory this script was started from and make both it and
# the shared perlasm directory searchable, then pull in the x86 assembler
# front-end that provides the &mov/&xor/&DWP/... helpers used below.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$OPENSSL=1;	# non-zero enables the Camellia_{encrypt,decrypt,set_key} wrappers

# The last command-line argument names the output file. Everything this
# script generates is written through STDOUT, so a failure to open the
# file must be fatal instead of silently producing no output.
$output = pop;
(defined($output) && length($output))
	or die "$0: output file name expected as last argument\n";
open STDOUT,">",$output or die "can't open $output: $!";

# $ARGV[0] is handed over as the first asm_init argument; the second
# argument is true when the last remaining command-line word is "386".
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# Register assignment shared by the code generators in this file.
@T=("eax","ebx","ecx","edx");	# four 32-bit words of working state
$idx="esi";			# scratch: table index, prefetched key word, I/O pointer
$key="edi";			# key schedule pointer
$Tbl="ebp";			# Camellia_SBOX base inside the cipher rounds

# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller
$__ra=&DWP(0,"esp");	# return address
$__s0=&DWP(4,"esp");	# s0 backing store
$__s1=&DWP(8,"esp");	# s1 backing store
$__s2=&DWP(12,"esp");	# s2 backing store
$__s3=&DWP(16,"esp");	# s3 backing store
$__end=&DWP(20,"esp");	# pointer to end/start of key schedule

# stack frame layout in Camellia_[en|de]crypt routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp");

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to optimize code size.
# The values below are byte offsets from $Tbl; entries are addressed with
# a scale factor of 8, hence the 4-byte distance within each pair.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");

# Camellia_Feistel(i[,seed[,frame]]) emits the code for one Feistel round.
#
#   i      round number within the current group; its parity selects
#          which pair of @T is the round input (t0,t1) and which pair
#          receives the result (t2,t3)
#   seed   byte offset, relative to $key, of the subkey for round 0;
#          a negative seed also switches the subkey stride from +8 to -8
#          (defaults to 0)
#   frame  byte offset from %esp of the s0 backing-store slot
#          (defaults to 0)
#
# Generated code expects the input pair in registers and the low word of
# the round subkey in $idx. It leaves the updated output pair both in
# registers and in the stack frame, and $idx preloaded with the low word
# of the next subkey. The registers that held the input pair are reused
# as temporaries.
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $frame=defined($_[2])?$_[2]:0;
my $j=($i&1)*2;
# all four names are lexical; a comma-separated "my $t0=...,$t1=..."
# would declare only the first one
my ($t0,$t1,$t2,$t3)=@T[($j)%4,($j+1)%4,($j+2)%4,($j+3)%4];

	&xor	($t0,$idx);				# t0^=key[0]
	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
	&shr	($t0,16);
	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
	&shr	($t1,16);
	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
	&xor	($t2,$t3);				# t2^=t3
	&rotr	($t3,8);				# t3=RightRotate(t3,8)
	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
	&xor	($t3,$t0);				# t3^=s3
	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
	&xor	($t3,$t2);				# t3^=t2
	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
	&xor	($t2,$t1);				# t2^=s2
	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
#
# Public entry point: builds the aligned stack frame expected by
# _x86_Camellia_encrypt, loads the 16-byte block as four byte-swapped
# 32-bit words, runs the cipher and stores the byte-swapped result.
&function_begin("Camellia_EncryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");		# remember caller's %esp
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);			# grandRounds*64 bytes of schedule
	&lea	("eax",&DWP(0,$key,"eax"));	# keyEnd=key+grandRounds*64
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	# position-independent way to obtain the address of Camellia_SBOX
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);		# back to caller's stack
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_EncryptBlock_Rounds");
# V1.x API
# Takes keyBitLength as its first argument, rewrites that argument slot
# to the matching grandRounds count and continues in
# Camellia_EncryptBlock_Rounds. The subtraction is performed only for
# its borrow: CF is set when keyBitLength exceeds 128, the following mov
# does not touch flags, and adc then yields 3 or 4.
&function_begin_B("Camellia_EncryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");

if ($OPENSSL) {
# void Camellia_encrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
#
# Same as Camellia_EncryptBlock_Rounds except for the argument order and
# that the grandRounds count is read from offset 272 of the key structure.
&function_begin("Camellia_encrypt");
	&mov	($idx,&wparam(0));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");		# remember caller's %esp
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);			# grandRounds*64 bytes of schedule
	&lea	("eax",&DWP(0,$key,"eax"));	# keyEnd=key+grandRounds*64
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	# position-independent way to obtain the address of Camellia_SBOX
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);		# back to caller's stack
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_encrypt");
}

# Private encryption core with a register-based interface.
#
# in:	@T[0-3]	block words (already byte-swapped by the caller)
#	$key	start of key schedule
#	$Tbl	Camellia_SBOX
#	$__end	end of key schedule, stored in the frame by the caller
# out:	@T[0-3]	result words
#
# The caller allocates the frame described by $__s0..$__end. The
# schedule is consumed in 64-byte strides: 16 bytes of whitening/FL keys
# followed by six 8-byte round keys.
&function_begin_B("_x86_Camellia_encrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	# six Feistel rounds, subkeys at $key+16.., s0 slot at 4(%esp)
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }

	&add	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# FL/FL^-1 layer, the two halves are interleaved (note the indent)
	# @T[0-1] are preloaded, $idx is preloaded with key[0]
	&and	($idx,@T[0]);
	 &mov	 (@T[3],$__s3);
	&rotl	($idx,1);
	 &mov	 (@T[2],@T[3]);
	&xor	(@T[1],$idx);
	 &or	 (@T[2],&DWP(12,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
	 &xor	 (@T[2],$__s2);

	&mov	($idx,&DWP(4,$key));
	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[3];
	&or	($idx,@T[1]);
	 &and	 (@T[2],&DWP(8,$key));
	&xor	(@T[0],$idx);
	 &rotl	 (@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
	 &xor	 (@T[3],@T[2]);
	&mov	($idx,&DWP(16,$key));		# prefetch key[4]
	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&ret	();
&function_end_B("_x86_Camellia_encrypt");

# void Camellia_DecryptBlock_Rounds(
#		int grandRounds,
#		const Byte ciphertext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte plaintext[])
#
# Public entry point: mirrors Camellia_EncryptBlock_Rounds, but hands
# _x86_Camellia_decrypt a $key that points at the END of the schedule
# and stores the schedule START in the frame as the loop terminator.
&function_begin("Camellia_DecryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");		# remember caller's %esp
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);			# grandRounds*64 bytes of schedule
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));	# key+=grandRounds*64
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	# position-independent way to obtain the address of Camellia_SBOX
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));	# back to caller's stack
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_DecryptBlock_Rounds");
# V1.x API
# Takes keyBitLength as its first argument, rewrites that argument slot
# to the matching grandRounds count and continues in
# Camellia_DecryptBlock_Rounds. The subtraction is performed only for
# its borrow: CF is set when keyBitLength exceeds 128, the following mov
# does not touch flags, and adc then yields 3 or 4.
&function_begin_B("Camellia_DecryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");

if ($OPENSSL) {
# void Camellia_decrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
#
# Same as Camellia_DecryptBlock_Rounds except for the argument order and
# that the grandRounds count is read from offset 272 of the key structure.
&function_begin("Camellia_decrypt");
	&mov	($idx,&wparam(0));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");		# remember caller's %esp
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);			# grandRounds*64 bytes of schedule
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));	# key+=grandRounds*64
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	# position-independent way to obtain the address of Camellia_SBOX
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));	# back to caller's stack
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_decrypt");
}

# Private decryption core with a register-based interface.
#
# in:	@T[0-3]	block words (already byte-swapped by the caller)
#	$key	END of key schedule
#	$Tbl	Camellia_SBOX
#	$__end	START of key schedule, stored in the frame by the caller
# out:	@T[0-3]	result words
#
# The schedule is walked backwards in 64-byte strides, so round keys
# are fetched at negative offsets and the FL/FL^-1 layer uses the four
# words at $key in the opposite pairing compared to encryption.
&function_begin_B("_x86_Camellia_decrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	# six Feistel rounds, subkeys at $key-8 downwards, s0 slot at 4(%esp)
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }

	&sub	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# FL/FL^-1 layer, the two halves are interleaved (note the indent)
	# @T[0-1] are preloaded, $idx is preloaded with key[2]
	&and	($idx,@T[0]);
	 &mov	 (@T[3],$__s3);
	&rotl	($idx,1);
	 &mov	 (@T[2],@T[3]);
	&xor	(@T[1],$idx);
	 &or	 (@T[2],&DWP(4,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[2],1);
	 &xor	 (@T[2],$__s2);

	&mov	($idx,&DWP(12,$key));
	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[1];
	&or	($idx,@T[1]);
	 &and	 (@T[2],&DWP(0,$key));
	&xor	(@T[0],$idx);
	 &rotl	 (@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[3];
	 &xor	 (@T[3],@T[2]);
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[0],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[2],$idx);		# $idx is preloaded with key[2]
	&xor	(@T[3],&DWP(12,$key));
	&xor	(@T[0],&DWP(0,$key));
	&xor	(@T[1],&DWP(4,$key));
	&ret	();
&function_end_B("_x86_Camellia_decrypt");

# shld is very slow on Intel P4 family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance. PIII, PM and Core[2] seem to be the only ones which
# execute this code ~7% faster...
#
# __rotl128(i0,i1,i2,i3,rot,rnd,regs...) is the shld-based variant of
# _rotl128 below and takes the same arguments: four registers holding a
# 128-bit value, a left-rotate count, a store position in 8-byte units
# and the list of registers to store. It uses $idx as scratch. It is
# not called anywhere in the visible part of this file.
sub __rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;		# 8-byte units -> 32-bit word index
    if ($rot) {
	&mov	($idx,$i0);	# keep original top word for the wrap-around
	&shld	($i0,$i1,$rot);
	&shld	($i1,$i2,$rot);
	&shld	($i2,$i3,$rot);
	&shld	($i3,$idx,$rot);
    }
    # store only the registers named in the trailing list, in order
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
}

# ... Implementing 128-bit rotate without shld gives >3x performance
# improvement on P4, only ~7% degradation on other Intel CPUs and
# not worse performance on AMD. This is therefore preferred.
#
# _rotl128(i0,i1,i2,i3,rot,rnd,regs...)
#
#   i0..i3  registers holding the 128-bit value, i0 being the word whose
#           top bits wrap around into i3
#   rot     left-rotate count applied in place; 0 emits stores only
#   rnd     store position in 8-byte units; words are written to
#           -128+8*rnd(,$key) onwards
#   regs    registers to store after the rotation; must name i0..i3 in
#           the same order, a subset stores fewer words
#
# The generated code uses $idx and $Tbl as scratch registers. Note that
# the trailing list is captured in a lexical @T that shadows the global
# register list inside this sub.
sub _rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;		# 8-byte units -> 32-bit word index
    if ($rot) {
	&mov	($Tbl,$i0);	# keep original i0 for the wrap-around
	&shl	($i0,$rot);
	&mov	($idx,$i1);
	&shr	($idx,32-$rot);
	&shl	($i1,$rot);
	&or	($i0,$idx);
	&mov	($idx,$i2);
	&shl	($i2,$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&shr	($idx,32-$rot);
	&or	($i1,$idx);
	&shr	($Tbl,32-$rot);
	&mov	($idx,$i3);
	&shr	($idx,32-$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&shl	($i3,$rot);
	&or	($i2,$idx);
	&or	($i3,$Tbl);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    } else {
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    }
}

# _saveround(rnd,key,[bias,]regs...) stores up to four registers to
# consecutive 32-bit words starting at bias+8*rnd(key).
#
#   rnd   position in 8-byte units
#   key   base register
#   bias  optional byte displacement; recognised by being a non-zero
#         number in front of the register names
#   regs  one to four registers to store
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

	&mov	(&DWP($bias+$rnd*8+0,$key),@T[0]);
	&mov	(&DWP($bias+$rnd*8+4,$key),@T[1])	if ($#T>=1);
	&mov	(&DWP($bias+$rnd*8+8,$key),@T[2])	if ($#T>=2);
	&mov	(&DWP($bias+$rnd*8+12,$key),@T[3])	if ($#T>=3);
}

# _loadround(rnd,key,[bias,]regs...) loads up to four registers from
# consecutive 32-bit words starting at bias+8*rnd(key).
#
#   rnd   position in 8-byte units
#   key   base register
#   bias  optional byte displacement; recognised by being a non-zero
#         number in front of the register names
#   regs  one to four registers to load
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

	&mov	(@T[0],&DWP($bias+$rnd*8+0,$key));
	&mov	(@T[1],&DWP($bias+$rnd*8+4,$key))	if ($#T>=1);
	&mov	(@T[2],&DWP($bias+$rnd*8+8,$key))	if ($#T>=2);
	&mov	(@T[3],&DWP($bias+$rnd*8+12,$key))	if ($#T>=3);
}

# void Camellia_Ekeygen(
#		const int keyBitLength,
#		const Byte *rawKey,
#		KEY_TABLE_TYPE keyTable)
#
# Expands rawKey into keyTable. Besides filling the table the generated
# code leaves the grandRounds count (3 for 128-bit keys, 4 otherwise) in
# %eax and the address keyTable+272 in %edx; Camellia_set_key relies on
# both.
#
# Outline:
#   1. KL (and, for longer keys, KR) is loaded, byte-swapped and parked
#      in keyTable; KR for 192-bit keys has its upper half derived by
#      complementing the lower one.
#   2. Feistel rounds keyed by Camellia_SIGMA turn KL^KR into KA and,
#      for longer keys, KA^KR into KB.
#   3. Rotated copies of KL/KR/KA/KB are written out with _rotl128.
#      Rotations by 32 are done at generation time by rotating the Perl
#      register list @T, which costs no instructions; @T is put back in
#      its original order before each exit.
#
# $step counts Camellia_Feistel invocations at generation time and
# doubles as the index of the SIGMA constant in use.
&function_begin("Camellia_Ekeygen");
{ my $step=0;

	&stack_push(4);				# place for s[0-3]

	&mov	($Tbl,&wparam(0));		# load arguments
	&mov	($idx,&wparam(1));
	&mov	($key,&wparam(2));

	&mov	(@T[0],&DWP(0,$idx));		# load 0-127 bits
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&mov	(@T[3],&DWP(12,$idx));

	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(0,$key,@T);		# KL<<<0

	&cmp	($Tbl,128);
	&je	(&label("1st128"));

	&mov	(@T[0],&DWP(16,$idx));		# load 128-191 bits
	&mov	(@T[1],&DWP(20,$idx));
	&cmp	($Tbl,192);
	&je	(&label("1st192"));
	&mov	(@T[2],&DWP(24,$idx));		# load 192-255 bits
	&mov	(@T[3],&DWP(28,$idx));
	&jmp	(&label("1st256"));
&set_label("1st192",4);
	&mov	(@T[2],@T[0]);			# upper half is ~lower half
	&mov	(@T[3],@T[1]);
	&not	(@T[2]);
	&not	(@T[3]);
&set_label("1st256",4);
	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(4,$key,@T);		# temporary storage for KR!

	&xor	(@T[0],&DWP(0*8+0,$key));	# KR^KL
	&xor	(@T[1],&DWP(0*8+4,$key));
	&xor	(@T[2],&DWP(1*8+0,$key));
	&xor	(@T[3],&DWP(1*8+4,$key));

&set_label("1st128",4);
	# from here on $Tbl is the S-box base and $key points at SIGMA
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
	&lea	($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[0]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));		# reload halves used as temporaries
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(2));
	&xor	(@T[0],&DWP(0*8+0,$idx));	# ^KL
	&xor	(@T[1],&DWP(0*8+4,$idx));
	&xor	(@T[2],&DWP(1*8+0,$idx));
	&xor	(@T[3],&DWP(1*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[4]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));		# reload halves used as temporaries
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(0));
	&cmp	($idx,128);
	&jne	(&label("2nd256"));

	&mov	($key,&wparam(2));
	&lea	($key,&DWP(128,$key));		# size optimization

	####### process KA
	&_saveround	(2,$key,-128,@T);	# KA<<<0
	&_rotl128	(@T,15,6,@T);		# KA<<<15
	&_rotl128	(@T,15,8,@T);		# KA<<<(15+15=30)
	&_rotl128	(@T,15,12,@T[0],@T[1]);	# KA<<<(30+15=45)
	&_rotl128	(@T,15,14,@T);		# KA<<<(45+15=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,20,@T);		# KA<<<(60+32+2=94)
	&_rotl128	(@T,17,24,@T);		# KA<<<(94+17=111)

	####### process KL
	&_loadround	(0,$key,-128,@T);	# load KL
	&_rotl128	(@T,15,4,@T);		# KL<<<15
	&_rotl128	(@T,30,10,@T);		# KL<<<(15+30=45)
	&_rotl128	(@T,15,13,@T[2],@T[3]);	# KL<<<(45+15=60)
	&_rotl128	(@T,17,16,@T);		# KL<<<(60+17=77)
	&_rotl128	(@T,17,18,@T);		# KL<<<(77+17=94)
	&_rotl128	(@T,17,22,@T);		# KL<<<(94+17=111)

	while (@T[0] ne "eax")			# restore order
	{   unshift	(@T,pop(@T));   }

	&mov	("eax",3);			# 3 grandRounds
	&jmp	(&label("done"));

&set_label("2nd256",16);
	&mov	($idx,&wparam(2));
	&_saveround	(6,$idx,@T);		# temporary storage for KA!

	&xor	(@T[0],&DWP(4*8+0,$idx));	# KA^KR
	&xor	(@T[1],&DWP(4*8+4,$idx));
	&xor	(@T[2],&DWP(5*8+0,$idx));
	&xor	(@T[3],&DWP(5*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[8]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));		# reload halves used as temporaries
	&mov	(@T[3],&swtmp(3));

	&mov	($key,&wparam(2));
	&lea	($key,&DWP(128,$key));		# size optimization

	####### process KB
	&_saveround	(2,$key,-128,@T);	# KB<<<0
	&_rotl128	(@T,30,10,@T);		# KB<<<30
	&_rotl128	(@T,30,20,@T);		# KB<<<(30+30=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,19,32,@T);		# KB<<<(60+32+19=111)

	####### process KR
	&_loadround	(4,$key,-128,@T);	# load KR
	&_rotl128	(@T,15,4,@T);		# KR<<<15
	&_rotl128	(@T,15,8,@T);		# KR<<<(15+15=30)
	&_rotl128	(@T,30,18,@T);		# KR<<<(30+30=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,26,@T);		# KR<<<(60+32+2=94)

	####### process KA
	&_loadround	(6,$key,-128,@T);	# load KA
	&_rotl128	(@T,15,6,@T);		# KA<<<15
	&_rotl128	(@T,30,14,@T);		# KA<<<(15+30=45)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,0,24,@T);		# KA<<<(45+32+0=77)
	&_rotl128	(@T,17,28,@T);		# KA<<<(77+17=94)

	####### process KL
	&_loadround	(0,$key,-128,@T);	# load KL
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,13,12,@T);		# KL<<<(32+13=45)
	&_rotl128	(@T,15,16,@T);		# KL<<<(45+15=60)
	&_rotl128	(@T,17,22,@T);		# KL<<<(60+17=77)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,30,@T);		# KL<<<(77+32+2=111)

	while (@T[0] ne "eax")			# restore order
	{   unshift	(@T,pop(@T));   }

	&mov	("eax",4);			# 4 grandRounds
&set_label("done");
	&lea	("edx",&DWP(272-128,$key));	# end of key schedule
	&stack_pop(4);
}
&function_end("Camellia_Ekeygen");

if ($OPENSSL) {
# int Camellia_set_key (
#		const unsigned char *userKey,
#		int bits,
#		CAMELLIA_KEY *key)
#
# Validates the arguments, calls Camellia_Ekeygen with them reordered
# and stores the returned grandRounds count where Camellia_Ekeygen's
# %edx points. Returns -1 when userKey or key is NULL, -2 when bits is
# not one of 128, 192 or 256, and 0 on success.
&function_begin_B("Camellia_set_key");
	&push	("ebx");
	&mov	("ecx",&wparam(0));	# pull arguments
	&mov	("ebx",&wparam(1));
	&mov	("edx",&wparam(2));

	&mov	("eax",-1);
	&test	("ecx","ecx");
	&jz	(&label("done"));	# userKey==NULL?
	&test	("edx","edx");
	&jz	(&label("done"));	# key==NULL?

	&mov	("eax",-2);
	&cmp	("ebx",256);
	&je	(&label("arg_ok"));	# bits==256?
	&cmp	("ebx",192);
	&je	(&label("arg_ok"));	# bits==192?
	&cmp	("ebx",128);
	&jne	(&label("done"));	# bits!=128?
&set_label("arg_ok",4);

	&push	("edx");		# push arguments
	&push	("ecx");
	&push	("ebx");
	&call	("Camellia_Ekeygen");
	&stack_pop(3);

	# eax holds grandRounds and edx points at where to put it
	&mov	(&DWP(0,"edx"),"eax");
	&xor	("eax","eax");
&set_label("done",4);
	&pop	("ebx");
	&ret	();
&function_end_B("Camellia_set_key");
}

Packit c4476c
# Base 8-bit substitution table (256 entries, indexed by input byte).
# The four helpers below derive the entries of the generated 32-bit
# lookup tables from it.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper takes a byte index (0..255) and returns one 32-bit table
# word.  The digits in the name give, from the most to the least
# significant byte lane, which substituted value occupies that lane;
# 0 marks a lane that is left zero.
#
# The table is read with the scalar element form $SBOX[...]; a
# one-element slice (@SBOX[...]) yields the same value but is what perl
# warns about as "better written as $SBOX[...]".

# Plain table value replicated into byte lanes 3, 2 and 1.
sub S1110 {
    my $idx = shift;
    my $val = $SBOX[$idx];
    return ($val << 24) | ($val << 16) | ($val << 8);
}

# Index is rotated left by one bit (within 8 bits) BEFORE the lookup;
# the result goes into byte lanes 3, 2 and 0.
sub S4404 {
    my $idx = shift;
    $idx = (($idx << 1) | ($idx >> 7)) & 0xff;
    my $val = $SBOX[$idx];
    return ($val << 24) | ($val << 16) | $val;
}

# Table value is rotated left by one bit (within 8 bits) AFTER the
# lookup; the result goes into byte lanes 2, 1 and 0.
sub S0222 {
    my $idx = shift;
    my $val = $SBOX[$idx];
    $val = (($val << 1) | ($val >> 7)) & 0xff;
    return ($val << 16) | ($val << 8) | $val;
}

# Table value is rotated right by one bit (within 8 bits) AFTER the
# lookup; the result goes into byte lanes 3, 1 and 0.
sub S3033 {
    my $idx = shift;
    my $val = $SBOX[$idx];
    $val = (($val >> 1) | ($val << 7)) & 0xff;
    return ($val << 24) | ($val << 8) | $val;
}
Packit c4476c
Packit c4476c
# Constant block emitted under the label Camellia_SIGMA (set_label is
# given 64 as its second argument): twelve non-zero words followed by
# four zero words, all passed to a single data_word call.
&set_label("Camellia_SIGMA",64);
{
    my @sigma_words = (
	0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
	0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
	0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
    );
    &data_word(@sigma_words, (0) x 4);
}

# Lookup tables emitted under the label Camellia_SBOX.  The tables are
# interleaved pairwise: every data_word call contributes one word to
# each table of the pair, first S1110/S4404 for all 256 byte values,
# then S0222/S3033.
&set_label("Camellia_SBOX",64);
foreach my $byte (0 .. 255) {
    &data_word(&S1110($byte), &S4404($byte));
}
foreach my $byte (0 .. 255) {
    &data_word(&S0222($byte), &S3033($byte));
}
Packit c4476c
Packit c4476c
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
Packit c4476c
#			size_t length, const CAMELLIA_KEY *key,
Packit c4476c
#			unsigned char *ivp,const int enc);
Packit c4476c
# Generator for Camellia_cbc_encrypt.  Overview of the emitted code:
#   - returns immediately when len is 0;
#   - saves EFLAGS and clears the direction flag (string instructions are
#     used for the partial-block paths), then switches to a private,
#     64-byte aligned stack frame positioned relative to the key schedule;
#   - touches the whole 4096-byte lookup table once to pull it into cache;
#   - wparam(5) (enc) selects the encrypt path (non-zero) or the DECRYPT
#     path (zero); decryption has separate loops for out!=inp and out==inp;
#   - inputs whose length is not a multiple of 16 are handled by the
#     enc_tail / dec_partial / dec_in_place_partial paths.
# $_esp and $_end are used below but are not declared in this block;
# presumably they name the 20(%esp) and 16(%esp) slots of the layout that
# follows and are defined earlier in the file -- confirm there.
{
Packit c4476c
# stack frame layout
Packit c4476c
#             -4(%esp)		# return address	 0(%esp)
Packit c4476c
#              0(%esp)		# s0			 4(%esp)
Packit c4476c
#              4(%esp)		# s1			 8(%esp)
Packit c4476c
#              8(%esp)		# s2			12(%esp)
Packit c4476c
#             12(%esp)		# s3			16(%esp)
Packit c4476c
#             16(%esp)		# end of key schedule	20(%esp)
Packit c4476c
#             20(%esp)		# %esp backup
Packit c4476c
my $_inp=&DWP(24,"esp");	#copy of wparam(0)
Packit c4476c
my $_out=&DWP(28,"esp");	#copy of wparam(1)
Packit c4476c
my $_len=&DWP(32,"esp");	#copy of wparam(2)
Packit c4476c
my $_key=&DWP(36,"esp");	#copy of wparam(3)
Packit c4476c
my $_ivp=&DWP(40,"esp");	#copy of wparam(4)
Packit c4476c
my $ivec=&DWP(44,"esp");	#ivec[16]
Packit c4476c
my $_tmp=&DWP(44,"esp");	#volatile variable [yes, aliases with ivec]
Packit c4476c
my ($s0,$s1,$s2,$s3) = @T;
Packit c4476c
Packit c4476c
&function_begin("Camellia_cbc_encrypt");
Packit c4476c
	# The `$x eq "reg" ? $x : ""` expressions used throughout yield an
	# empty operand unless the perl variable currently names the given
	# register; presumably a generation-time guard for places that rely
	# on the implicit ecx/esi/edi operands of rep movsb/stosb.
	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
Packit c4476c
	&cmp	($s2,0);
Packit c4476c
	&je	(&label("enc_out"));
Packit c4476c
Packit c4476c
	&pushf	();
Packit c4476c
	&cld	();
Packit c4476c
Packit c4476c
	&mov	($s0,&wparam(0));	# load inp
Packit c4476c
	&mov	($s1,&wparam(1));	# load out
Packit c4476c
	#&mov	($s2,&wparam(2));	# load len
Packit c4476c
	&mov	($s3,&wparam(3));	# load key
Packit c4476c
	&mov	($Tbl,&wparam(4));	# load ivp
Packit c4476c
Packit c4476c
	# allocate aligned stack frame...
Packit c4476c
	&lea	($idx,&DWP(-64,"esp"));
Packit c4476c
	&and	($idx,-64);
Packit c4476c
Packit c4476c
	# place stack frame just "above mod 1024" the key schedule
Packit c4476c
	# this ensures that cache associativity of 2 suffices
Packit c4476c
	&lea	($key,&DWP(-64-63,$s3));
Packit c4476c
	&sub	($key,$idx);
Packit c4476c
	&neg	($key);
Packit c4476c
	&and	($key,0x3C0);	# modulo 1024, but aligned to cache-line
Packit c4476c
	&sub	($idx,$key);
Packit c4476c
Packit c4476c
	&mov	($key,&wparam(5));	# load enc
Packit c4476c
Packit c4476c
	# switch stacks: esp becomes the new frame, $idx receives the old esp
	&exch	("esp",$idx);
Packit c4476c
	&add	("esp",4);		# reserve for return address!
Packit c4476c
	&mov	($_esp,$idx);		# save %esp
Packit c4476c
Packit c4476c
	&mov	($_inp,$s0);		# save copy of inp
Packit c4476c
	&mov	($_out,$s1);		# save copy of out
Packit c4476c
	&mov	($_len,$s2);		# save copy of len
Packit c4476c
	&mov	($_key,$s3);		# save copy of key
Packit c4476c
	&mov	($_ivp,$Tbl);		# save copy of ivp
Packit c4476c
Packit c4476c
	&call   (&label("pic_point"));	# make it PIC!
Packit c4476c
	&set_label("pic_point");
Packit c4476c
	&blindpop($Tbl);
Packit c4476c
	&lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
Packit c4476c
Packit c4476c
	# prefetch: 32 iterations x 128 bytes = 4096 bytes, one load every
	# 32 bytes; the loaded values themselves are discarded
	&mov	($idx,32);
Packit c4476c
	&set_label("prefetch_sbox",4);
Packit c4476c
		&mov	($s0,&DWP(0,$Tbl));
Packit c4476c
		&mov	($s1,&DWP(32,$Tbl));
Packit c4476c
		&mov	($s2,&DWP(64,$Tbl));
Packit c4476c
		&mov	($s3,&DWP(96,$Tbl));
Packit c4476c
		&lea	($Tbl,&DWP(128,$Tbl));
Packit c4476c
		&dec	($idx);
Packit c4476c
	&jnz	(&label("prefetch_sbox"));
Packit c4476c
	&mov	($s0,$_key);
Packit c4476c
	# rewind $Tbl to the start of the table after the prefetch walk
	&sub	($Tbl,4096);
Packit c4476c
	&mov	($idx,$_inp);
Packit c4476c
	&mov	($s3,&DWP(272,$s0));		# load grandRounds
Packit c4476c
Packit c4476c
	# $key still holds enc here: zero selects decryption
	&cmp	($key,0);
Packit c4476c
	&je	(&label("DECRYPT"));
Packit c4476c
Packit c4476c
	#----------------------------- ENCRYPT -----------------------------#
	&mov	($s2,$_len);
Packit c4476c
	&mov	($key,$_ivp);
Packit c4476c
	# $_end = key + grandRounds*64
	&shl	($s3,6);
Packit c4476c
	&lea	($s3,&DWP(0,$s0,$s3));
Packit c4476c
	&mov	($_end,$s3);
Packit c4476c
Packit c4476c
	&test	($s2,0xFFFFFFF0);
Packit c4476c
	&jz	(&label("enc_tail"));		# short input...
Packit c4476c
Packit c4476c
	&mov	($s0,&DWP(0,$key));		# load iv
Packit c4476c
	&mov	($s1,&DWP(4,$key));
Packit c4476c
Packit c4476c
	# Loop invariant: $key points at the current chaining block (the
	# caller's iv on the first pass, the output block just written on
	# later passes) and $s0/$s1 already hold its first two dwords, so
	# only dwords 2 and 3 are reloaded at the top.
	&set_label("enc_loop",4);
Packit c4476c
		&mov	($s2,&DWP(8,$key));
Packit c4476c
		&mov	($s3,&DWP(12,$key));
Packit c4476c
Packit c4476c
		&xor	($s0,&DWP(0,$idx));	# xor input data
Packit c4476c
		&xor	($s1,&DWP(4,$idx));
Packit c4476c
		&xor	($s2,&DWP(8,$idx));
Packit c4476c
		&bswap	($s0);
Packit c4476c
		&xor	($s3,&DWP(12,$idx));
Packit c4476c
		&bswap	($s1);
Packit c4476c
		&mov	($key,$_key);		# load key
Packit c4476c
		&bswap	($s2);
Packit c4476c
		&bswap	($s3);
Packit c4476c
Packit c4476c
		&call	("_x86_Camellia_encrypt");
Packit c4476c
Packit c4476c
		&mov	($idx,$_inp);		# load inp
Packit c4476c
		&mov	($key,$_out);		# load out
Packit c4476c
Packit c4476c
		&bswap	($s0);
Packit c4476c
		&bswap	($s1);
Packit c4476c
		&bswap	($s2);
Packit c4476c
		&mov	(&DWP(0,$key),$s0);	# save output data
Packit c4476c
		&bswap	($s3);
Packit c4476c
		&mov	(&DWP(4,$key),$s1);
Packit c4476c
		&mov	(&DWP(8,$key),$s2);
Packit c4476c
		&mov	(&DWP(12,$key),$s3);
Packit c4476c
Packit c4476c
		&mov	($s2,$_len);		# load len
Packit c4476c
Packit c4476c
		&lea	($idx,&DWP(16,$idx));
Packit c4476c
		&mov	($_inp,$idx);		# save inp
Packit c4476c
Packit c4476c
		# $s3 (not $key) takes the advanced pointer, so $key keeps
		# pointing at the block just written for the next pass
		&lea	($s3,&DWP(16,$key));
Packit c4476c
		&mov	($_out,$s3);		# save out
Packit c4476c
Packit c4476c
		&sub	($s2,16);
Packit c4476c
		&test	($s2,0xFFFFFFF0);
Packit c4476c
		&mov	($_len,$s2);		# save len
Packit c4476c
	# the mov above leaves the flags produced by the test intact
	&jnz	(&label("enc_loop"));
Packit c4476c
	&test	($s2,15);
Packit c4476c
	&jnz	(&label("enc_tail"));
Packit c4476c
	&mov	($idx,$_ivp);		# load ivp
Packit c4476c
	&mov	($s2,&DWP(8,$key));	# restore last dwords
Packit c4476c
	&mov	($s3,&DWP(12,$key));
Packit c4476c
	&mov	(&DWP(0,$idx),$s0);	# save ivec
Packit c4476c
	&mov	(&DWP(4,$idx),$s1);
Packit c4476c
	&mov	(&DWP(8,$idx),$s2);
Packit c4476c
	&mov	(&DWP(12,$idx),$s3);
Packit c4476c
Packit c4476c
	&mov	("esp",$_esp);
Packit c4476c
	&popf	();
Packit c4476c
    &set_label("enc_out");
Packit c4476c
	&function_end_A();
Packit c4476c
	&pushf	();			# kludge, never executed
Packit c4476c
Packit c4476c
    # Partial final block on the encrypt side.  On entry $s2 is the number
    # of remaining bytes (1..15), $idx the input pointer and $key the
    # pointer to the current chaining block.  The remaining input is
    # copied to out (skipped when out==inp), the rest of the 16-byte
    # block is zero-filled, and enc_loop is re-entered once with out as
    # input and len=16.
    &set_label("enc_tail",4);
Packit c4476c
	&mov	($s0,$key eq "edi" ? $key : "");
Packit c4476c
	&mov	($key,$_out);			# load out
Packit c4476c
	&push	($s0);				# push ivp
Packit c4476c
	&mov	($s1,16);
Packit c4476c
	&sub	($s1,$s2);
Packit c4476c
	&cmp	($key,$idx);			# compare with inp
Packit c4476c
	&je	(&label("enc_in_place"));
Packit c4476c
	&align	(4);
Packit c4476c
	# raw opcode bytes 89 F6 F3 A4 (dword stored little-endian):
	# a 2-byte `mov esi,esi` filler followed by `rep movsb`
	&data_word(0xA4F3F689);	# rep movsb	# copy input
Packit c4476c
	&jmp	(&label("enc_skip_in_place"));
Packit c4476c
    &set_label("enc_in_place");
Packit c4476c
	&lea	($key,&DWP(0,$key,$s2));
Packit c4476c
    &set_label("enc_skip_in_place");
Packit c4476c
	&mov	($s2,$s1);
Packit c4476c
	&xor	($s0,$s0);
Packit c4476c
	&align	(4);
Packit c4476c
	# raw opcode bytes 89 F6 F3 AA: `mov esi,esi` filler + `rep stosb`
	&data_word(0xAAF3F689);	# rep stosb	# zero tail
Packit c4476c
	&pop	($key);				# pop ivp
Packit c4476c
Packit c4476c
	&mov	($idx,$_out);			# output as input
Packit c4476c
	&mov	($s0,&DWP(0,$key));
Packit c4476c
	&mov	($s1,&DWP(4,$key));
Packit c4476c
	&mov	($_len,16);			# len=16
Packit c4476c
	&jmp	(&label("enc_loop"));		# one more spin...
Packit c4476c
Packit c4476c
#----------------------------- DECRYPT -----------------------------#
Packit c4476c
# On entry $s0 = key pointer, $s3 = grandRounds, $idx = inp.  Note the
# roles are swapped relative to the encrypt path: $_end receives the start
# of the key schedule and $_key receives key + grandRounds*64.
&set_label("DECRYPT",16);
Packit c4476c
	&shl	($s3,6);
Packit c4476c
	&lea	($s3,&DWP(0,$s0,$s3));
Packit c4476c
	&mov	($_end,$s0);
Packit c4476c
	&mov	($_key,$s3);
Packit c4476c
Packit c4476c
	&cmp	($idx,$_out);
Packit c4476c
	&je	(&label("dec_in_place"));	# in-place processing...
Packit c4476c
Packit c4476c
	# out != inp: $_tmp tracks the address of the current chaining block
	# (the caller's iv first, afterwards the previous input block, which
	# is still intact because output goes elsewhere)
	&mov	($key,$_ivp);			# load ivp
Packit c4476c
	&mov	($_tmp,$key);
Packit c4476c
Packit c4476c
	&set_label("dec_loop",4);
Packit c4476c
		&mov	($s0,&DWP(0,$idx));	# read input
Packit c4476c
		&mov	($s1,&DWP(4,$idx));
Packit c4476c
		&mov	($s2,&DWP(8,$idx));
Packit c4476c
		&bswap	($s0);
Packit c4476c
		&mov	($s3,&DWP(12,$idx));
Packit c4476c
		&bswap	($s1);
Packit c4476c
		&mov	($key,$_key);		# load key
Packit c4476c
		&bswap	($s2);
Packit c4476c
		&bswap	($s3);
Packit c4476c
Packit c4476c
		&call	("_x86_Camellia_decrypt");
Packit c4476c
Packit c4476c
		&mov	($key,$_tmp);		# load ivp
Packit c4476c
		&mov	($idx,$_len);		# load len
Packit c4476c
Packit c4476c
		&bswap	($s0);
Packit c4476c
		&bswap	($s1);
Packit c4476c
		&bswap	($s2);
Packit c4476c
		&xor	($s0,&DWP(0,$key));	# xor iv
Packit c4476c
		&bswap	($s3);
Packit c4476c
		&xor	($s1,&DWP(4,$key));
Packit c4476c
		&xor	($s2,&DWP(8,$key));
Packit c4476c
		&xor	($s3,&DWP(12,$key));
Packit c4476c
Packit c4476c
		# borrow => fewer than 16 bytes were left for this block
		&sub	($idx,16);
Packit c4476c
		&jc	(&label("dec_partial"));
Packit c4476c
		&mov	($_len,$idx);		# save len
Packit c4476c
		&mov	($idx,$_inp);		# load inp
Packit c4476c
		&mov	($key,$_out);		# load out
Packit c4476c
Packit c4476c
		&mov	(&DWP(0,$key),$s0);	# write output
Packit c4476c
		&mov	(&DWP(4,$key),$s1);
Packit c4476c
		&mov	(&DWP(8,$key),$s2);
Packit c4476c
		&mov	(&DWP(12,$key),$s3);
Packit c4476c
Packit c4476c
		&mov	($_tmp,$idx);		# save ivp
Packit c4476c
		&lea	($idx,&DWP(16,$idx));
Packit c4476c
		&mov	($_inp,$idx);		# save inp
Packit c4476c
Packit c4476c
		&lea	($key,&DWP(16,$key));
Packit c4476c
		&mov	($_out,$key);		# save out
Packit c4476c
Packit c4476c
	# ZF still reflects the `sub $idx,16` above: only mov/lea (which do
	# not modify flags) were emitted in between
	&jnz	(&label("dec_loop"));
Packit c4476c
	&mov	($key,$_tmp);		# load temp ivp
Packit c4476c
    # common epilogue for the out!=inp paths: copy the 16 bytes at $key
    # into the caller's iv buffer
    &set_label("dec_end");
Packit c4476c
	&mov	($idx,$_ivp);		# load user ivp
Packit c4476c
	&mov	($s0,&DWP(0,$key));	# load iv
Packit c4476c
	&mov	($s1,&DWP(4,$key));
Packit c4476c
	&mov	($s2,&DWP(8,$key));
Packit c4476c
	&mov	($s3,&DWP(12,$key));
Packit c4476c
	&mov	(&DWP(0,$idx),$s0);	# copy back to user
Packit c4476c
	&mov	(&DWP(4,$idx),$s1);
Packit c4476c
	&mov	(&DWP(8,$idx),$s2);
Packit c4476c
	&mov	(&DWP(12,$idx),$s3);
Packit c4476c
	&jmp	(&label("dec_out"));
Packit c4476c
Packit c4476c
    # Partial final block, out != inp.  $idx holds len-16 (negative), so
    # $idx+16 is the number of bytes still wanted.  The decrypted block is
    # parked in the stack buffer and only that many bytes are copied out.
    # $_inp has not been advanced for this block, so it still addresses
    # the last input block, which dec_end then hands back as the iv.
    &set_label("dec_partial",4);
Packit c4476c
	&lea	($key,$ivec);
Packit c4476c
	&mov	(&DWP(0,$key),$s0);	# dump output to stack
Packit c4476c
	&mov	(&DWP(4,$key),$s1);
Packit c4476c
	&mov	(&DWP(8,$key),$s2);
Packit c4476c
	&mov	(&DWP(12,$key),$s3);
Packit c4476c
	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
Packit c4476c
	&mov	($idx eq "esi" ? $idx : "",$key);
Packit c4476c
	&mov	($key eq "edi" ? $key : "",$_out);	# load out
Packit c4476c
	&data_word(0xA4F3F689);	# rep movsb		# copy output
Packit c4476c
	&mov	($key,$_inp);				# use inp as temp ivp
Packit c4476c
	&jmp	(&label("dec_end"));
Packit c4476c
Packit c4476c
    # out == inp: writing the output destroys the input block that is
    # needed as the next iv, so each input block is first copied to the
    # stack buffer ($ivec) and from there into the caller's iv buffer once
    # the current block has been written.
    &set_label("dec_in_place",4);
Packit c4476c
	&set_label("dec_in_place_loop");
Packit c4476c
		&lea	($key,$ivec);
Packit c4476c
		&mov	($s0,&DWP(0,$idx));	# read input
Packit c4476c
		&mov	($s1,&DWP(4,$idx));
Packit c4476c
		&mov	($s2,&DWP(8,$idx));
Packit c4476c
		&mov	($s3,&DWP(12,$idx));
Packit c4476c
Packit c4476c
		&mov	(&DWP(0,$key),$s0);	# copy to temp
Packit c4476c
		&mov	(&DWP(4,$key),$s1);
Packit c4476c
		&mov	(&DWP(8,$key),$s2);
Packit c4476c
		&bswap	($s0);
Packit c4476c
		&mov	(&DWP(12,$key),$s3);
Packit c4476c
		&bswap	($s1);
Packit c4476c
		&mov	($key,$_key);		# load key
Packit c4476c
		&bswap	($s2);
Packit c4476c
		&bswap	($s3);
Packit c4476c
Packit c4476c
		&call	("_x86_Camellia_decrypt");
Packit c4476c
Packit c4476c
		&mov	($key,$_ivp);		# load ivp
Packit c4476c
		&mov	($idx,$_out);		# load out
Packit c4476c
Packit c4476c
		&bswap	($s0);
Packit c4476c
		&bswap	($s1);
Packit c4476c
		&bswap	($s2);
Packit c4476c
		&xor	($s0,&DWP(0,$key));	# xor iv
Packit c4476c
		&bswap	($s3);
Packit c4476c
		&xor	($s1,&DWP(4,$key));
Packit c4476c
		&xor	($s2,&DWP(8,$key));
Packit c4476c
		&xor	($s3,&DWP(12,$key));
Packit c4476c
Packit c4476c
		&mov	(&DWP(0,$idx),$s0);	# write output
Packit c4476c
		&mov	(&DWP(4,$idx),$s1);
Packit c4476c
		&mov	(&DWP(8,$idx),$s2);
Packit c4476c
		&mov	(&DWP(12,$idx),$s3);
Packit c4476c
Packit c4476c
		&lea	($idx,&DWP(16,$idx));
Packit c4476c
		&mov	($_out,$idx);		# save out
Packit c4476c
Packit c4476c
		&lea	($idx,$ivec);
Packit c4476c
		&mov	($s0,&DWP(0,$idx));	# read temp
Packit c4476c
		&mov	($s1,&DWP(4,$idx));
Packit c4476c
		&mov	($s2,&DWP(8,$idx));
Packit c4476c
		&mov	($s3,&DWP(12,$idx));
Packit c4476c
Packit c4476c
		&mov	(&DWP(0,$key),$s0);	# copy iv
Packit c4476c
		&mov	(&DWP(4,$key),$s1);
Packit c4476c
		&mov	(&DWP(8,$key),$s2);
Packit c4476c
		&mov	(&DWP(12,$key),$s3);
Packit c4476c
Packit c4476c
		&mov	($idx,$_inp);		# load inp
Packit c4476c
Packit c4476c
		&lea	($idx,&DWP(16,$idx));
Packit c4476c
		&mov	($_inp,$idx);		# save inp
Packit c4476c
Packit c4476c
		&mov	($s2,$_len);		# load len
Packit c4476c
		&sub	($s2,16);
Packit c4476c
		&jc	(&label("dec_in_place_partial"));
Packit c4476c
		&mov	($_len,$s2);		# save len
Packit c4476c
	# ZF still reflects the `sub $s2,16` above (mov leaves flags alone)
	&jnz	(&label("dec_in_place_loop"));
Packit c4476c
	&jmp	(&label("dec_out"));
Packit c4476c
Packit c4476c
    # Partial final block, in place.  A full 16 bytes were just written,
    # although fewer were requested.  $s2 holds len-16 (negative) and $_out
    # has already been advanced by 16, so out+$s2 is the first byte past
    # the requested length; -$s2 bytes of the saved input block are copied
    # back over that excess.
    &set_label("dec_in_place_partial",4);
Packit c4476c
	# one can argue if this is actually required...
Packit c4476c
	&mov	($key eq "edi" ? $key : "",$_out);
Packit c4476c
	&lea	($idx eq "esi" ? $idx : "",$ivec);
Packit c4476c
	&lea	($key,&DWP(0,$key,$s2));
Packit c4476c
	&lea	($idx,&DWP(16,$idx,$s2));
Packit c4476c
	&neg	($s2 eq "ecx" ? $s2 : "");
Packit c4476c
	&data_word(0xA4F3F689);	# rep movsb	# restore tail
Packit c4476c
Packit c4476c
    # common decrypt exit: return to the caller's stack and restore the
    # EFLAGS image pushed in the prologue
    &set_label("dec_out",4);
Packit c4476c
    &mov	("esp",$_esp);
Packit c4476c
    &popf	();
Packit c4476c
&function_end("Camellia_cbc_encrypt");
Packit c4476c
}
Packit c4476c
Packit c4476c
# Identification string handed to the perlasm asciz helper.
&asciz("Camellia for x86 by <appro\@openssl.org>");

# Hand control to the perlasm framework to finish the generated output.
&asm_finish();

# Treat a failed close of STDOUT as fatal so that the error is not
# silently ignored by whatever invoked this script.
unless (close(STDOUT)) {
    die "error closing STDOUT: $!";
}