Blame crypto/rc4/asm/rc4-md5-x86_64.pl

Packit c4476c
#! /usr/bin/env perl
Packit c4476c
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
Packit c4476c
#
Packit c4476c
# Licensed under the OpenSSL license (the "License").  You may not use
Packit c4476c
# this file except in compliance with the License.  You can obtain a copy
Packit c4476c
# in the file LICENSE in the source distribution or at
Packit c4476c
# https://www.openssl.org/source/license.html
Packit c4476c
Packit c4476c
#
Packit c4476c
# ====================================================================
Packit c4476c
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
Packit c4476c
# project. The module is, however, dual licensed under OpenSSL and
Packit c4476c
# CRYPTOGAMS licenses depending on where you obtain it. For further
Packit c4476c
# details see http://www.openssl.org/~appro/cryptogams/.
Packit c4476c
# ====================================================================
Packit c4476c
Packit c4476c
# June 2011
Packit c4476c
#
Packit c4476c
# This is RC4+MD5 "stitch" implementation. The idea, as spelled in
Packit c4476c
# http://download.intel.com/design/intarch/papers/323686.pdf, is that
Packit c4476c
# since both algorithms exhibit instruction-level parallelism, ILP,
Packit c4476c
# below theoretical maximum, interleaving them would allow to utilize
Packit c4476c
# processor resources better and achieve better performance. RC4
Packit c4476c
# instruction sequence is virtually identical to rc4-x86_64.pl, which
Packit c4476c
# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin
Packit c4476c
# and Jim Guilford of Intel. MD5 is fresh implementation aiming to
Packit c4476c
# minimize register usage, which was used as "main thread" with RC4
Packit c4476c
# weaved into it, one RC4 round per one MD5 round. In addition to the
Packit c4476c
# stitched subroutine the script can generate standalone replacement
Packit c4476c
# md5_block_asm_data_order and RC4. Below are performance numbers in
Packit c4476c
# cycles per processed byte, less is better, for the standalone
Packit c4476c
# subroutines, sum of them, and stitched one:
Packit c4476c
#
Packit c4476c
#		RC4	MD5	RC4+MD5	stitch	gain
Packit c4476c
# Opteron	6.5(*)	5.4	11.9	7.0	+70%(*)
Packit c4476c
# Core2		6.5	5.8	12.3	7.7	+60%
Packit c4476c
# Westmere	4.3	5.2	9.5	7.0	+36%
Packit c4476c
# Sandy Bridge	4.2	5.5	9.7	6.8	+43%
Packit c4476c
# Ivy Bridge	4.1	5.2	9.3	6.0	+54%
Packit c4476c
# Haswell	4.0	5.0	9.0	5.7	+60%
Packit c4476c
# Skylake	6.3(**)	5.0	11.3	5.3	+110%
Packit c4476c
# Atom		9.3	6.5	15.8	11.1	+42%
Packit c4476c
# VIA Nano	6.3	5.4	11.7	8.6	+37%
Packit c4476c
# Bulldozer	4.5	5.4	9.9	7.7	+29%
Packit c4476c
#
Packit c4476c
# (*)	rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
Packit c4476c
#	is +53%...
Packit c4476c
# (**)	unidentified anomaly;
Packit c4476c
Packit c4476c
# Generation switches, command-line handling and output pipe set-up.
# Everything printed to STDOUT after this point is fed through the
# x86_64-xlate.pl translator.
my ($rc4,$md5)=(1,1);	# what to generate?
my $D=$md5 ? "" : "#";	# if set to "#", MD5 is stitched into RC4(),
			# but its result is discarded. Idea here is
			# to be able to use 'openssl speed rc4' for
			# benchmarking the stitched subroutine...
			# (initialised unconditionally: "my ... if" has
			# undefined behaviour according to perlsyn)

# Usage: <script> [flavour] output.  A single argument that contains a dot
# is taken to be the output file name.
my $flavour = shift;
my $output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Fail loudly if the translator cannot be started instead of silently
# printing into a closed handle.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
Packit c4476c
Packit c4476c
# Argument registers, entry point name and argument count for the flavour
# of subroutine being generated (standalone RC4, standalone MD5, or the
# stitched rc4_md5_enc).
my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);

if ($rc4 && !$md5) {
  ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
  $func="RC4";				$nargs=4;
} elsif ($md5 && !$rc4) {
  ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
  $func="md5_block_asm_data_order";	$nargs=3;
} else {
  ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
  $func="rc4_md5_enc";			$nargs=6;
  # void rc4_md5_enc(
  #		RC4_KEY *key,		#
  #		const void *in0,	# RC4 input
  #		void *out,		# RC4 output
  #		MD5_CTX *ctx,		#
  #		const void *inp,	# MD5 input
  #		size_t len);		# number of 64-byte blocks
}

# Per-step additive constants, indexed by step number 0..63 (used as
# $K[$i] by R0..R3).
my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
	0x6b901122,0xfd987193,0xa679438e,0x49b40821,

	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,

	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,

	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391	);

my @V=("%r8d","%r9d","%r10d","%r11d");	# MD5 registers
my $tmp="%r12d";

my @XX=("%rbp","%rsi");			# RC4 registers
my @TX=("%rax","%rbx");
my $YY="%rcx";
my $TY="%rdx";

my $MOD=32;				# 16, 32 or 64
Packit c4476c
Packit c4476c
# Function prologue: return immediately when $len is zero, otherwise save
# the six callee-saved registers and reserve 40 bytes of scratch stack.
# .Lbody marks the end of the prologue for the Win64 SE handler.
$code.=<<___;
.text
.align 16

.globl	$func
.type	$func,\@function,$nargs
$func:
.cfi_startproc
	cmp	\$0,$len
	je	.Labort
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$40,%rsp
.cfi_adjust_cfa_offset	40
.Lbody:
___
Packit c4476c
# RC4 set-up: move the arguments into callee-saved registers, load the x/y
# indices stored in front of the state table, and (standalone RC4 only)
# emit a byte-wise warm-up loop that runs until the x index is a multiple
# of $MOD before the main loop starts.
if ($rc4) {
$code.=<<___;
$D#md5#	mov	$ctx,%r11		# reassign arguments
	mov	$len,%r12
	mov	$in0,%r13
	mov	$out,%r14
$D#md5#	mov	$inp,%r15
___
    # From here on the Perl-side names refer to the new registers.
    $ctx="%r11"	if ($md5);		# reassign arguments
    $len="%r12";
    $in0="%r13";
    $out="%r14";
    $inp="%r15"	if ($md5);
    $inp=$in0	if (!$md5);
$code.=<<___;
	xor	$XX[0],$XX[0]
	xor	$YY,$YY

	lea	8($dat),$dat
	mov	-8($dat),$XX[0]#b
	mov	-4($dat),$YY#b

	inc	$XX[0]#b
	sub	$in0,$out
	movl	($dat,$XX[0],4),$TX[0]#d
___
$code.=<<___ if (!$md5);
	xor	$TX[1],$TX[1]
	test	\$-128,$len
	jz	.Loop1
	sub	$XX[0],$TX[1]
	and	\$`$MOD-1`,$TX[1]
	jz	.Loop${MOD}_is_hot
	sub	$TX[1],$len
.Loop${MOD}_warmup:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($in0),$TY#b
	movb	$TY#b,($out,$in0)
	lea	1($in0),$in0
	dec	$TX[1]
	jnz	.Loop${MOD}_warmup

	mov	$YY,$TX[1]
	xor	$YY,$YY
	mov	$TX[1]#b,$YY#b

.Loop${MOD}_is_hot:
	mov	$len,32(%rsp)		# save original $len
	shr	\$6,$len		# number of 64-byte blocks
___
  if ($D && !$md5) {			# stitch in dummy MD5
    # Benchmark mode: MD5 runs over the RC4 input and uses the stack
    # scratch area as its context, so the result is thrown away.
    $md5=1;
    $ctx="%r11";
    $inp="%r15";
    $code.=<<___;
	mov	%rsp,$ctx
	mov	$in0,$inp
___
  }
}
Packit c4476c
# Main loop head: turn the block count into an end-of-input pointer kept at
# 16(%rsp), load the current hash value from MD5_CTX, and at the top of
# each iteration put that value aside on the stack.  Lines tagged #rc4# or
# #md5# are only activated when the matching algorithm is generated.
$code.=<<___;
#rc4#	add	$TX[0]#b,$YY#b
#rc4#	lea	($dat,$XX[0],4),$XX[1]
	shl	\$6,$len
	add	$inp,$len		# pointer to the end of input
	mov	$len,16(%rsp)

#md5#	mov	$ctx,24(%rsp)		# save pointer to MD5_CTX
#md5#	mov	0*4($ctx),$V[0]		# load current hash value from MD5_CTX
#md5#	mov	1*4($ctx),$V[1]
#md5#	mov	2*4($ctx),$V[2]
#md5#	mov	3*4($ctx),$V[3]
	jmp	.Loop

.align	16
.Loop:
#md5#	mov	$V[0],0*4(%rsp)		# put aside current hash value
#md5#	mov	$V[1],1*4(%rsp)
#md5#	mov	$V[2],2*4(%rsp)
#md5#	mov	$V[3],$tmp		# forward reference
#md5#	mov	$V[3],3*4(%rsp)
___
Packit c4476c
Packit c4476c
# R0 - append one step of the first MD5 round (steps 0..15), interleaved
# with one RC4 step, to $code.
#   $i           step number; $j=$i%16 selects the message word and the
#                rotation, $k=$i%$MOD is the slot in the current RC4 window
#   $a,$b,$c,$d  MD5 working registers in their current rotation
# $tmp is expected to hold $d on entry (the "forward reference" left by
# the previous step), so xor/and/xor computes ((c^d)&b)^d.  The RC4
# keystream is collected with pinsrw into %xmm0 (even $j) and %xmm1
# (odd $j); at $j==15 both are xor-ed into the 16 input bytes in %xmm2.
sub R0 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot0=(7,12,17,22);
  my $j=$i%16;
  my $k=$i%$MOD;
  my $xmm="%xmm".($j&1);
    $code.="	movdqu	($in0),%xmm2\n"		if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	and	$b,$tmp
#md5#	add	4*`$j`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#md5#	xor	$d,$tmp
#rc4#	movz	$TX[0]#b,$TX[0]#d
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot0[$j%4],$a
#md5#	mov	`$j==15?"$b":"$c"`,$tmp		# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    # End of an RC4 window: zero-extend $YY and re-base $XX[1].
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    # Merge the even/odd keystream halves into the input block.
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm2
	pxor	%xmm1,%xmm2
___
}
Packit c4476c
# R1 - append one step of the second MD5 round (steps 16..31), interleaved
# with one RC4 step, to $code.
#   $i           step number; $j=$i%16 selects rotation and message word
#                (1+5*$j)%16, $k=$i%$MOD is the slot in the RC4 window
#   $a,$b,$c,$d  MD5 working registers in their current rotation
# $tmp is expected to hold $c on entry, so xor/and/xor computes
# ((b^c)&d)^c.  The RC4 keystream is collected in %xmm0/%xmm1 and at
# $j==15 xor-ed into input bytes 16..31, loaded into %xmm3.
sub R1 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot1=(5,9,14,20);
  my $j=$i%16;
  my $k=$i%$MOD;
  my $xmm="%xmm".($j&1);
    $code.="	movdqu	16($in0),%xmm3\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$b,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	and	$d,$tmp
#md5#	add	4*`((1+5*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#md5#	xor	$c,$tmp
#rc4#	movz	$TX[0]#b,$TX[0]#d
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot1[$j%4],$a
#md5#	mov	`$j==15?"$c":"$b"`,$tmp		# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    # End of an RC4 window: zero-extend $YY and re-base $XX[1].
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    # Merge the even/odd keystream halves into the input block.
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3
___
}
Packit c4476c
# R2 - append one step of the third MD5 round (steps 32..47), interleaved
# with one RC4 step, to $code.
#   $i           step number; $j=$i%16 selects rotation and message word
#                (5+3*$j)%16, $k=$i%$MOD is the slot in the RC4 window
#   $a,$b,$c,$d  MD5 working registers in their current rotation
# $tmp is expected to hold $d on entry, so the two xors compute b^c^d.
# The last step ($j==15) preloads $tmp with -1 for the fourth round.  The
# RC4 keystream is collected in %xmm0/%xmm1 and at $j==15 xor-ed into
# input bytes 32..47, loaded into %xmm4.
sub R2 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot2=(4,11,16,23);
  my $j=$i%16;
  my $k=$i%$MOD;
  my $xmm="%xmm".($j&1);
    $code.="	movdqu	32($in0),%xmm4\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	xor	$b,$tmp
#md5#	add	4*`((5+3*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#rc4#	movz	$TX[0]#b,$TX[0]#d
#md5#	add	$tmp,$a
#rc4#	movl	$TY#d,4*$k($XX[1])
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot2[$j%4],$a
#md5#	mov	`$j==15?"\\\$-1":"$c"`,$tmp	# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    # End of an RC4 window: zero-extend $YY and re-base $XX[1].
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    # Merge the even/odd keystream halves into the input block.
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm4
___
}
Packit c4476c
# R3 - append one step of the fourth MD5 round (steps 48..63), interleaved
# with one RC4 step, to $code.
#   $i           step number; $j=$i%16 selects rotation and message word
#                (7*$j)%16, $k=$i%$MOD is the slot in the RC4 window
#   $a,$b,$c,$d  MD5 working registers in their current rotation
# $tmp is expected to hold -1 on entry, so xor/or/xor computes c^(b|~d);
# every step reloads $tmp with -1 for the next one.  The RC4 keystream is
# collected in %xmm0/%xmm1 and at $j==15 xor-ed into input bytes 48..63,
# loaded into %xmm5.
sub R3 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot3=(6,10,15,21);
  my $j=$i%16;
  my $k=$i%$MOD;
  my $xmm="%xmm".($j&1);
    $code.="	movdqu	48($in0),%xmm5\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$d,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	or	$b,$tmp
#md5#	add	4*`((7*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#rc4#	movz	$TX[0]#b,$TX[0]#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot3[$j%4],$a
#md5#	mov	\$-1,$tmp			# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    # Last step of the block: zero-extend both RC4 indices, re-base
    # $XX[1] and merge the keystream halves into the input block.
    $code.=<<___ if ($rc4 && $j==15);
	mov	$XX[0],$XX[1]
	xor	$XX[0],$XX[0]			# keyword to partial register
	mov	$XX[1]#b,$XX[0]#b
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm5
	pxor	%xmm1,%xmm5
___
}
Packit c4476c
Packit c4476c
# Unroll all 64 steps.  After every step the MD5 register list is rotated
# right by one and the two RC4 temporaries are swapped, so each generated
# step sees the registers in the roles it expects.
my $i=0;
for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
Packit c4476c
Packit c4476c
# Loop tail: add the saved hash value back in, store the four 16-byte RC4
# output chunks, advance the pointers and loop until the end-of-input
# pointer at 16(%rsp) is reached; then write the hash back to MD5_CTX.
$code.=<<___;
#md5#	add	0*4(%rsp),$V[0]		# accumulate hash value
#md5#	add	1*4(%rsp),$V[1]
#md5#	add	2*4(%rsp),$V[2]
#md5#	add	3*4(%rsp),$V[3]

#rc4#	movdqu	%xmm2,($out,$in0)	# write RC4 output
#rc4#	movdqu	%xmm3,16($out,$in0)
#rc4#	movdqu	%xmm4,32($out,$in0)
#rc4#	movdqu	%xmm5,48($out,$in0)
#md5#	lea	64($inp),$inp
#rc4#	lea	64($in0),$in0
	cmp	16(%rsp),$inp		# are we done?
	jb	.Loop

#md5#	mov	24(%rsp),$len		# restore pointer to MD5_CTX
#rc4#	sub	$TX[0]#b,$YY#b		# correct $YY
#md5#	mov	$V[0],0*4($len)		# write MD5_CTX
#md5#	mov	$V[1],1*4($len)
#md5#	mov	$V[2],2*4($len)
#md5#	mov	$V[3],3*4($len)
___
# Standalone/benchmark RC4 only: process the bytes left over after the
# last whole 64-byte block one at a time.
$code.=<<___ if ($rc4 && (!$md5 || $D));
	mov	32(%rsp),$len		# restore original $len
	and	\$63,$len		# remaining bytes
	jnz	.Loop1
	jmp	.Ldone

.align	16
.Loop1:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($in0),$TY#b
	movb	$TY#b,($out,$in0)
	lea	1($in0),$in0
	dec	$len
	jnz	.Loop1

.Ldone:
___
# Epilogue: store the RC4 indices back in front of the state table,
# restore the callee-saved registers and release the 88-byte frame
# (40 bytes of scratch plus six pushes).
$code.=<<___;
#rc4#	sub	\$1,$XX[0]#b
#rc4#	movl	$XX[0]#d,-8($dat)
#rc4#	movl	$YY#d,-4($dat)

	mov	40(%rsp),%r15
.cfi_restore	%r15
	mov	48(%rsp),%r14
.cfi_restore	%r14
	mov	56(%rsp),%r13
.cfi_restore	%r13
	mov	64(%rsp),%r12
.cfi_restore	%r12
	mov	72(%rsp),%rbp
.cfi_restore	%rbp
	mov	80(%rsp),%rbx
.cfi_restore	%rbx
	lea	88(%rsp),%rsp
.cfi_adjust_cfa_offset	-88
.Lepilogue:
.Labort:
	ret
.cfi_endproc
.size $func,.-$func
___
Packit c4476c
Packit c4476c
# Emits RC4_set_key and RC4_options.  This section is generated only when
# $D is set, i.e. in the RC4 benchmark configuration.
if ($rc4 && $D) {	# sole purpose of this section is to provide
			# option to use the generated module as drop-in
			# replacement for rc4-x86_64.pl for debugging
			# and testing purposes...
my ($idx,$ido)=("%r8","%r9");
my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");

# .Lw1stloop fills the table with 0..255; .Lw2ndloop performs the
# key-dependent swaps, wrapping the negative key offset in $len back to
# its start value (kept in %rcx) whenever it reaches zero.
$code.=<<___;
.globl	RC4_set_key
.type	RC4_set_key,\@function,3
.align	16
RC4_set_key:
.cfi_startproc
	lea	8($dat),$dat
	lea	($inp,$len),$inp
	neg	$len
	mov	$len,%rcx
	xor	%eax,%eax
	xor	$ido,$ido
	xor	%r10,%r10
	xor	%r11,%r11
	jmp	.Lw1stloop

.align	16
.Lw1stloop:
	mov	%eax,($dat,%rax,4)
	add	\$1,%al
	jnc	.Lw1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lw2ndloop:
	mov	($dat,$ido,4),%r10d
	add	($inp,$len,1),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx,4),%r11d
	cmovz	%rcx,$len
	mov	%r10d,($dat,$idx,4)
	mov	%r11d,($dat,$ido,4)
	add	\$1,$ido#b
	jnc	.Lw2ndloop

	xor	%eax,%eax
	mov	%eax,-8($dat)
	mov	%eax,-4($dat)
	ret
.cfi_endproc
.size	RC4_set_key,.-RC4_set_key

.globl	RC4_options
.type	RC4_options,\@abi-omnipotent
.align	16
RC4_options:
	lea	.Lopts(%rip),%rax
	ret
.align	64
.Lopts:
.asciz	"rc4(64x,int)"
.align	64
.size	RC4_options,.-RC4_options
___
}
Packit c4476c
# Win64 structured-exception support, emitted only for Win64 flavours.
# The handler restores the non-volatile registers saved by the prologue
# into the CONTEXT record when the fault lies between .Lbody and
# .Lepilogue, then hands over to RtlVirtualUnwind.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
my $rec="%rcx";
my $frame="%rdx";
my $context="%r8";
my $disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	40(%rax),%r15
	mov	48(%rax),%r14
	mov	56(%rax),%r13
	mov	64(%rax),%r12
	mov	72(%rax),%rbp
	mov	80(%rax),%rbx
	lea	88(%rax),%rax

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func

.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
___
}
Packit c4476c
Packit c4476c
# reg_part - map a 64-bit register name to one of its sub-registers.
#   $reg   register name, e.g. "%rax" or "%r12"
#   $conv  "b" (byte), "w" (word) or "d" (dword)
# Numbered registers just get the suffix appended (%r8 -> %r8d); legacy
# registers are renamed (%rax -> %al / %ax / %eax, %rbp -> %bpl).
# Returns the resulting name; $reg is returned unchanged when $conv is
# not recognised.
sub reg_part {
my ($reg,$conv)=@_;
    if ($reg =~ /%r[0-9]+/)     { $reg .= $conv; }
    elsif ($conv eq "b")        { $reg =~ s/%[er]([^x]+)x?/%$1l/;       }
    elsif ($conv eq "w")        { $reg =~ s/%[er](.+)/%$1/;             }
    elsif ($conv eq "d")        { $reg =~ s/%[er](.+)/%e$1/;            }
    return $reg;
}
Packit c4476c
Packit c4476c
# Post-process the accumulated template and emit it:
#   1. resolve "%reg#b/#w/#d" sub-register requests via reg_part();
#   2. evaluate the Perl expressions enclosed in backticks;
#   3. replace "pinsrw $0," with "movd";
#   4. strip the #md5# / #rc4# tags of the algorithms being generated
#      (tags that are not stripped stay at the start of their lines).
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/pinsrw\s+\$0,/movd	/gm;

$code =~ s/#md5#//gm	if ($md5);
$code =~ s/#rc4#//gm	if ($rc4);

print $code;

# STDOUT is the pipe to the translator, so a failure there surfaces here.
close STDOUT or die "error closing STDOUT: $!";