/* poly1305-sse2-amd64.S  -  AMD64/SSE2 implementation of Poly1305
 *
 * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Based on public domain implementation by Andrew Moon at
 *  https://github.com/floodyberry/poly1305-opt
 */

#include <config.h>

#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
# define ELF(...) __VA_ARGS__
#else
# define ELF(...) /*_*/
#endif


.text


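/*
 * _gcry_poly1305_amd64_sse2_init_ext(state = %rdi, key = %rsi)
 *
 * Expand the 32-byte key (r || s): clamp r, split it into 26-bit limbs,
 * precompute r^2 and r^4 (mod 2^130-5) for the vectorized block loop and
 * store the 128-bit pad s.  The first instruction discards the caller's
 * length hint in %rdx, so both extra powers are always computed.
 */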
.align 8
.globl _gcry_poly1305_amd64_sse2_init_ext
ELF(.type  _gcry_poly1305_amd64_sse2_init_ext,@function;)
_gcry_poly1305_amd64_sse2_init_ext:
.Lpoly1305_init_ext_x86_local:
	xor %edx, %edx
	pushq %r12
	pushq %r13
	pushq %r14
	movq %rdx, %r10
	movq $-1, %rcx
	testq %r10, %r10
	pxor %xmm0, %xmm0
	movq $0xfffffc0ffff, %r9
	movdqa %xmm0, (%rdi)
	cmove %rcx, %r10
	movdqa %xmm0, 16(%rdi)
	movq $0xffc0fffffff, %rcx
	movdqa %xmm0, 32(%rdi)
	movdqa %xmm0, 48(%rdi)
	movdqa %xmm0, 64(%rdi)
	movq 8(%rsi), %r11
	movq %r11, %r8
	movq (%rsi), %r12
	andq %r12, %rcx
	shrq $44, %r12
	shlq $20, %r8
	shrq $24, %r11
	orq %r8, %r12
	movq $0xffffffc0f, %r8
	andq %r9, %r12
	andq %r8, %r11
	movl %ecx, %r8d
	andl $67108863, %r8d
	movq %rcx, %r9
	movl %r8d, 84(%rdi)
	movq %r12, %r8
	shrq $26, %r9
	shlq $18, %r8
	orq %r8, %r9
	movq %r12, %r8
	shrq $8, %r8
	andl $67108863, %r9d
	andl $67108863, %r8d
	movl %r9d, 92(%rdi)
	movq %r12, %r9
	movl %r8d, 100(%rdi)
	movq %r11, %r8
	shrq $34, %r9
	shlq $10, %r8
	orq %r8, %r9
	movq %r11, %r8
	shrq $16, %r8
	andl $67108863, %r9d
	movl %r9d, 108(%rdi)
	cmpq $16, %r10
	movl %r8d, 116(%rdi)
	movl 16(%rsi), %r8d
	movl %r8d, 124(%rdi)
	movl 20(%rsi), %r8d
	movl %r8d, 132(%rdi)
	movl 24(%rsi), %r8d
	movl %r8d, 140(%rdi)
	movl 28(%rsi), %esi
	movl %esi, 148(%rdi)
	jbe .Lpoly1305_init_ext_sse2_done
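	/* Compute r^2 = r*r (mod 2^130-5) and store it as 26-bit limbs,
	 * duplicated into both SIMD lanes.  */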
	lea (%r11,%r11,4), %r14
	shlq $2, %r14
	lea (%r12,%r12), %rax
	mulq %r14
	movq %rax, %r13
	movq %rcx, %rax
	movq %rdx, %r8
	mulq %rcx
	addq %rax, %r13
	lea (%rcx,%rcx), %rax
	movq %r13, %r9
	adcq %rdx, %r8
	mulq %r12
	shlq $20, %r8
	movq %rax, %rsi
	shrq $44, %r9
	movq %r11, %rax
	orq %r9, %r8
	movq %rdx, %r9
	mulq %r14
	addq %rax, %rsi
	movq %rcx, %rax
	adcq %rdx, %r9
	addq %r11, %r11
	mulq %r11
	addq %rsi, %r8
	movq %rax, %r11
	movq %r12, %rax
	movq %rdx, %rcx
	adcq $0, %r9
	mulq %r12
	addq %rax, %r11
	movq %r8, %rsi
	adcq %rdx, %rcx
	shlq $20, %r9
	shrq $44, %rsi
	orq %rsi, %r9
	movq $0xfffffffffff, %rsi
	addq %r11, %r9
	movq %r9, %r12
	adcq $0, %rcx
	andq %rsi, %r13
	shlq $22, %rcx
	andq %rsi, %r8
	shrq $42, %r12
	orq %r12, %rcx
	movq %rsi, %r12
	lea (%rcx,%rcx,4), %rcx
	addq %rcx, %r13
	movq %rsi, %rcx
	andq %r13, %rcx
	shrq $44, %r13
	movq %rcx, %r14
	addq %r13, %r8
	movq $0x3ffffffffff, %r13
	andq %r8, %r12
	andq %r13, %r9
	shrq $44, %r8
	movq %r12, %r11
	addq %r8, %r9
	movq %r12, %rax
	movq %r9, %r13
	movl %ecx, %r8d
	shrq $26, %r14
	andl $67108863, %r8d
	shlq $18, %r11
	shrq $34, %rax
	orq %r11, %r14
	shlq $10, %r13
	movq %r12, %r11
	orq %r13, %rax
	movq %r9, %r13
	shrq $8, %r11
	shrq $16, %r13
	andl $67108863, %r14d
	andl $67108863, %r11d
	andl $67108863, %eax
	movl %r8d, 88(%rdi)
	cmpq $64, %r10
	movl %r8d, 80(%rdi)
	movl %r14d, 104(%rdi)
	movl %r14d, 96(%rdi)
	movl %r11d, 120(%rdi)
	movl %r11d, 112(%rdi)
	movl %eax, 136(%rdi)
	movl %eax, 128(%rdi)
	movl %r13d, 152(%rdi)
	movl %r13d, 144(%rdi)
	jbe .Lpoly1305_init_ext_sse2_done
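	/* Compute r^4 = (r^2)^2 (mod 2^130-5) and store it the same way.  */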
	lea (%r9,%r9,4), %r14
	shlq $2, %r14
	lea (%r12,%r12), %rax
	mulq %r14
	movq %rax, %r8
	movq %rcx, %rax
	movq %rdx, %r10
	mulq %rcx
	addq %rax, %r8
	lea (%rcx,%rcx), %rax
	movq %r8, %r11
	adcq %rdx, %r10
	andq %rsi, %r8
	mulq %r12
	shlq $20, %r10
	movq %rax, %r13
	shrq $44, %r11
	movq %r9, %rax
	orq %r11, %r10
	movq %rdx, %r11
	mulq %r14
	addq %rax, %r13
	movq %rcx, %rax
	adcq %rdx, %r11
	addq %r9, %r9
	mulq %r9
	addq %r13, %r10
	movq %rax, %r9
	movq %r12, %rax
	movq %rdx, %rcx
	adcq $0, %r11
	mulq %r12
	addq %rax, %r9
	movq %r10, %r13
	adcq %rdx, %rcx
	andq %rsi, %r10
	shlq $20, %r11
	shrq $44, %r13
	orq %r13, %r11
	addq %r9, %r11
	movq %rsi, %r9
	movq %r11, %r12
	adcq $0, %rcx
	shlq $22, %rcx
	shrq $42, %r12
	orq %r12, %rcx
	lea (%rcx,%rcx,4), %rcx
	addq %rcx, %r8
	andq %r8, %r9
	shrq $44, %r8
	movl %r9d, %eax
	addq %r8, %r10
	movq $0x3ffffffffff, %r8
	andq %r10, %rsi
	andq %r8, %r11
	shrq $44, %r10
	movq %rsi, %r8
	addq %r10, %r11
	andl $67108863, %eax
	shrq $26, %r9
	movq %r11, %r10
	shlq $18, %r8
	shlq $10, %r10
	orq %r8, %r9
	movq %rsi, %r8
	shrq $34, %rsi
	andl $67108863, %r9d
	shrq $8, %r8
	orq %r10, %rsi
	shrq $16, %r11
	andl $67108863, %r8d
	andl $67108863, %esi
	movl %eax, 168(%rdi)
	movl %eax, 160(%rdi)
	movl %r9d, 184(%rdi)
	movl %r9d, 176(%rdi)
	movl %r8d, 200(%rdi)
	movl %r8d, 192(%rdi)
	movl %esi, 216(%rdi)
	movl %esi, 208(%rdi)
	movl %r11d, 232(%rdi)
	movl %r11d, 224(%rdi)
.Lpoly1305_init_ext_sse2_done:
	movq $0, 240(%rdi)
	popq %r14
	popq %r13
	popq %r12
	ret
ELF(.size _gcry_poly1305_amd64_sse2_init_ext,.-_gcry_poly1305_amd64_sse2_init_ext;)


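/*
 * _gcry_poly1305_amd64_sse2_finish_ext(state = %rdi, m = %rsi,
 *                                      remaining = %rdx, mac = %rcx)
 *
 * Pad the final 0..31 leftover bytes in a zeroed 32-byte stack buffer and
 * run them through the block function, then fold the accumulator, add the
 * pad s and write the 16-byte tag to mac.  The state is wiped and the
 * number of stack bytes used is returned in %rax so the caller can burn
 * the stack.
 */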
.align 8
.globl _gcry_poly1305_amd64_sse2_finish_ext
ELF(.type  _gcry_poly1305_amd64_sse2_finish_ext,@function;)
_gcry_poly1305_amd64_sse2_finish_ext:
.Lpoly1305_finish_ext_x86_local:
	pushq %rbp
	movq %rsp, %rbp
	subq $64, %rsp
	andq $~63, %rsp
	movq %rdx, 32(%rsp)
	movq %rcx, 40(%rsp)
	andq %rdx, %rdx
	jz .Lpoly1305_finish_x86_no_leftover
	pxor %xmm0, %xmm0
	movdqa %xmm0, 0+0(%rsp)
	movdqa %xmm0, 16+0(%rsp)
	leaq 0(%rsp), %r8
	testq $16, %rdx
	jz .Lpoly1305_finish_x86_skip16
	movdqu 0(%rsi), %xmm0
	movdqa %xmm0, 0(%r8)
	addq $16, %rsi
	addq $16, %r8
.Lpoly1305_finish_x86_skip16:
	testq $8, %rdx
	jz .Lpoly1305_finish_x86_skip8
	movq 0(%rsi), %rax
	movq %rax, 0(%r8)
	addq $8, %rsi
	addq $8, %r8
.Lpoly1305_finish_x86_skip8:
	testq $4, %rdx
	jz .Lpoly1305_finish_x86_skip4
	movl 0(%rsi), %eax
	movl %eax, 0(%r8)
	addq $4, %rsi
	addq $4, %r8
.Lpoly1305_finish_x86_skip4:
	testq $2, %rdx
	jz .Lpoly1305_finish_x86_skip2
	movw 0(%rsi), %ax
	movw %ax, 0(%r8)
	addq $2, %rsi
	addq $2, %r8
.Lpoly1305_finish_x86_skip2:
	testq $1, %rdx
	jz .Lpoly1305_finish_x86_skip1
	movb 0(%rsi), %al
	movb %al, 0(%r8)
	addq $1, %r8
.Lpoly1305_finish_x86_skip1:
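	/* Append the 0x01 terminator unless exactly 16 bytes were left, then
	 * record flag 4 (final chunk had >= 16 bytes) or flag 8 (< 16 bytes)
	 * in the state.  */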
	cmpq $16, %rdx
	je .Lpoly1305_finish_x86_is16
	movb $1, 0(%r8)
.Lpoly1305_finish_x86_is16:
	movq $4, %rax
	jae .Lpoly1305_finish_x86_16andover
	movq $8, %rax
.Lpoly1305_finish_x86_16andover:
	orq %rax, 240(%rdi)
	leaq 0(%rsp), %rsi
	movq $32, %rdx
	callq .Lpoly1305_blocks_x86_local
.Lpoly1305_finish_x86_no_leftover:
	testq $1, 240(%rdi)
	jz .Lpoly1305_finish_x86_not_started
	movq 32(%rsp), %rdx
	andq %rdx, %rdx
	jz .Lpoly1305_finish_x86_r2r
	cmpq $16, %rdx
	jg .Lpoly1305_finish_x86_r2r
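	/* 1..16 leftover bytes: set the lane multipliers for the final folding
	 * call to (r, 1) instead of the default (r^2, r).  */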
	xorl %r10d, %r10d
	movl 84(%rdi), %eax
	movl 92(%rdi), %ecx
	movl 100(%rdi), %edx
	movl 108(%rdi), %r8d
	movl 116(%rdi), %r9d
	movl %eax, 80(%rdi)
	movl $1, 8+80(%rdi)
	movl %ecx, 96(%rdi)
	movl %r10d, 8+96(%rdi)
	movl %edx, 112(%rdi)
	movl %r10d, 8+112(%rdi)
	movl %r8d, 128(%rdi)
	movl %r10d, 8+128(%rdi)
	movl %r9d, 144(%rdi)
	movl %r10d, 8+144(%rdi)
	jmp .Lpoly1305_finish_x86_combine
.Lpoly1305_finish_x86_r2r:
	movl 84(%rdi), %eax
	movl 92(%rdi), %ecx
	movl 100(%rdi), %edx
	movl 108(%rdi), %r8d
	movl 116(%rdi), %r9d
	movl %eax, 8+80(%rdi)
	movl %ecx, 8+96(%rdi)
	movl %edx, 8+112(%rdi)
	movl %r8d, 8+128(%rdi)
	movl %r9d, 8+144(%rdi)
.Lpoly1305_finish_x86_combine:
	xorq %rsi, %rsi
	movq $32, %rdx
	callq .Lpoly1305_blocks_x86_local
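	/* Convert h from 44-bit limbs back to 128 bits, add the pad s and
	 * write the tag, then wipe the state.  */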
.Lpoly1305_finish_x86_not_started:
	movq 0(%rdi), %r8
	movq 8(%rdi), %r9
	movq %r9, %r10
	movq 16(%rdi), %r11
	shlq $44, %r9
	shrq $20, %r10
	shlq $24, %r11
	orq %r9, %r8
	orq %r11, %r10
	pxor %xmm0, %xmm0
	movl 124(%rdi), %eax
	movl 132(%rdi), %ecx
	movl 140(%rdi), %edx
	movl 148(%rdi), %esi
	movq 40(%rsp), %r11
	shlq $32, %rcx
	shlq $32, %rsi
	orq %rcx, %rax
	orq %rsi, %rdx
	addq %r8, %rax
	adcq %r10, %rdx
	movq %rax, 0(%r11)
	movq %rdx, 8(%r11)
	movq %rbp, %rax
	subq %rsp, %rax
	movq %rbp, %rsp
	movdqa %xmm0, 0(%rdi)
	movdqa %xmm0, 16(%rdi)
	movdqa %xmm0, 32(%rdi)
	movdqa %xmm0, 48(%rdi)
	movdqa %xmm0, 64(%rdi)
	movdqa %xmm0, 80(%rdi)
	movdqa %xmm0, 96(%rdi)
	movdqa %xmm0, 112(%rdi)
	movdqa %xmm0, 128(%rdi)
	movdqa %xmm0, 144(%rdi)
	movdqa %xmm0, 160(%rdi)
	movdqa %xmm0, 176(%rdi)
	movdqa %xmm0, 192(%rdi)
	movdqa %xmm0, 208(%rdi)
	movdqa %xmm0, 224(%rdi)
	popq %rbp
	addq $8, %rax
	ret
ELF(.size _gcry_poly1305_amd64_sse2_finish_ext,.-_gcry_poly1305_amd64_sse2_finish_ext;)


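/*
 * _gcry_poly1305_amd64_sse2_blocks(state = %rdi, m = %rsi, bytes = %rdx)
 *
 * Process the message in 32-byte chunks, two Poly1305 blocks at a time
 * (one per 64-bit SSE2 lane, radix-2^26 limbs); the main loop consumes
 * 64 bytes per iteration using the precomputed r^2 and r^4.  When called
 * with m == NULL the accumulator is folded, fully reduced mod 2^130-5 and
 * stored back as three 44-bit limbs, and %rax returns the stack depth
 * used.
 */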
.align 8
.globl _gcry_poly1305_amd64_sse2_blocks
ELF(.type  _gcry_poly1305_amd64_sse2_blocks,@function;)
_gcry_poly1305_amd64_sse2_blocks:
.Lpoly1305_blocks_x86_local:
	pushq %rbp
	movq %rsp, %rbp
	pushq %rbx
	andq $-64, %rsp
	subq $328, %rsp
	movq 240(%rdi), %rax
	movl $(1<<24), %r8d
	movl $((1<<26)-1), %r9d
	movd %r8, %xmm0
	movd %r9, %xmm5
	pshufd $0x44, %xmm0, %xmm0
	pshufd $0x44, %xmm5, %xmm5
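	/* xmm0 holds the 2^128 padding bit (1 << 24 in the top 26-bit limb)
	 * for both lanes; flag 4 keeps it in only one lane and flag 8 drops
	 * it entirely (final, already-padded chunk).  */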
	testb $4, %al
	je .Lpoly1305_blocks_x86_3
	psrldq $8, %xmm0
.Lpoly1305_blocks_x86_3:
	testb $8, %al
	je .Lpoly1305_blocks_x86_4
	pxor %xmm0, %xmm0
.Lpoly1305_blocks_x86_4:
	movdqa %xmm0, 168(%rsp)
	testb $1, %al
	jne .Lpoly1305_blocks_x86_5
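	/* First call: initialize h directly from the first 32 bytes of input. */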
	movq 16(%rsi), %xmm0
	movdqa %xmm5, %xmm7
	movdqa %xmm5, %xmm10
	movq (%rsi), %xmm6
	orq $1, %rax
	subq $32, %rdx
	movq 8(%rsi), %xmm1
	punpcklqdq %xmm0, %xmm6
	movq 24(%rsi), %xmm0
	pand %xmm6, %xmm7
	movdqa %xmm6, %xmm9
	psrlq $52, %xmm6
	addq $32, %rsi
	punpcklqdq %xmm0, %xmm1
	movdqa %xmm1, %xmm0
	psrlq $26, %xmm9
	psllq $12, %xmm0
	movq %rax, 240(%rdi)
	pand %xmm5, %xmm9
	por %xmm0, %xmm6
	psrlq $40, %xmm1
	pand %xmm6, %xmm10
	por 168(%rsp), %xmm1
	psrlq $26, %xmm6
	pand %xmm5, %xmm6
.Lpoly1305_blocks_x86_6:
	movdqa 80(%rdi), %xmm13
	cmpq $63, %rdx
	movl $(5), %r8d
	movd %r8, %xmm14
	pshufd $0x44, %xmm14, %xmm14
	movdqa 96(%rdi), %xmm15
	movdqa %xmm13, -8(%rsp)
	movdqa 112(%rdi), %xmm0
	movdqa %xmm14, 136(%rsp)
	movdqa 128(%rdi), %xmm3
	movdqa %xmm15, 312(%rsp)
	pmuludq %xmm14, %xmm15
	movdqa 144(%rdi), %xmm13
	movdqa %xmm0, 232(%rsp)
	pmuludq %xmm14, %xmm0
	movdqa %xmm3, 152(%rsp)
	pmuludq %xmm14, %xmm3
	movdqa %xmm13, 56(%rsp)
	pmuludq %xmm14, %xmm13
	movdqa %xmm15, 40(%rsp)
	movdqa %xmm0, -24(%rsp)
	movdqa %xmm3, -40(%rsp)
	movdqa %xmm13, -56(%rsp)
	jbe .Lpoly1305_blocks_x86_7
	movdqa 192(%rdi), %xmm15
	leaq 32(%rsi), %rax
	movq %rdx, %rcx
	movdqa 176(%rdi), %xmm14
	movdqa %xmm15, %xmm2
	movdqa 208(%rdi), %xmm0
	movdqa %xmm15, 216(%rsp)
	movdqa %xmm14, 296(%rsp)
	movdqa 224(%rdi), %xmm3
	pmuludq 136(%rsp), %xmm14
	movdqa -24(%rsp), %xmm13
	movdqa %xmm14, 8(%rsp)
	pmuludq 136(%rsp), %xmm2
	movdqa -40(%rsp), %xmm14
	movdqa %xmm0, 120(%rsp)
	pmuludq 136(%rsp), %xmm0
	movdqa %xmm3, 24(%rsp)
	movdqa 160(%rdi), %xmm12
	movdqa %xmm0, %xmm8
	movdqa -56(%rsp), %xmm15
	movdqa %xmm13, 88(%rsp)
	pmuludq 136(%rsp), %xmm3
	movdqa %xmm2, 104(%rsp)
	movdqa %xmm0, %xmm13
	movdqa -8(%rsp), %xmm11
	movdqa %xmm3, 280(%rsp)
	movdqa %xmm2, %xmm3
	movdqa %xmm0, 200(%rsp)
	movdqa %xmm14, 184(%rsp)
	movdqa %xmm15, 264(%rsp)
	jmp .Lpoly1305_blocks_x86_8
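	/* Main loop: 64 bytes (four blocks) per iteration, multiplying with
	 * the precomputed powers r^4 and r^2.  */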
.p2align 6,,63
.Lpoly1305_blocks_x86_13:
	movdqa 200(%rsp), %xmm13
	movdqa %xmm3, %xmm6
	movdqa 200(%rsp), %xmm8
	movdqa 104(%rsp), %xmm3
.Lpoly1305_blocks_x86_8:
	movdqa 8(%rsp), %xmm4
	pmuludq %xmm6, %xmm3
	subq $64, %rcx
	pmuludq %xmm10, %xmm8
	movdqa 104(%rsp), %xmm2
	movdqa 200(%rsp), %xmm0
	pmuludq %xmm1, %xmm4
	movdqa 280(%rsp), %xmm15
	pmuludq %xmm6, %xmm13
	movdqa 280(%rsp), %xmm14
	pmuludq %xmm1, %xmm0
	paddq %xmm3, %xmm4
	pmuludq %xmm1, %xmm2
	movdqa 280(%rsp), %xmm3
	paddq %xmm8, %xmm4
	pmuludq %xmm9, %xmm15
	movdqa 280(%rsp), %xmm8
	pmuludq %xmm10, %xmm14
	pmuludq %xmm6, %xmm8
	paddq %xmm13, %xmm2
	movdqa %xmm6, %xmm13
	pmuludq %xmm1, %xmm3
	paddq %xmm15, %xmm4
	movdqa 296(%rsp), %xmm15
	pmuludq %xmm12, %xmm13
	paddq %xmm14, %xmm2
	movdqa %xmm7, %xmm14
	paddq %xmm8, %xmm0
	pmuludq %xmm12, %xmm14
	movdqa %xmm9, %xmm8
	pmuludq 296(%rsp), %xmm6
	pmuludq %xmm12, %xmm8
	movdqa %xmm6, 248(%rsp)
	pmuludq %xmm10, %xmm15
	movq -16(%rax), %xmm6
	paddq %xmm13, %xmm3
	movdqa %xmm10, %xmm13
	paddq %xmm14, %xmm4
	movq -8(%rax), %xmm14
	paddq %xmm8, %xmm2
	movq -32(%rax), %xmm8
	pmuludq %xmm12, %xmm13
	paddq %xmm15, %xmm3
	pmuludq %xmm12, %xmm1
	movdqa 216(%rsp), %xmm15
	pmuludq 216(%rsp), %xmm10
	punpcklqdq %xmm6, %xmm8
	movq -24(%rax), %xmm6
	pmuludq %xmm9, %xmm15
	paddq %xmm13, %xmm0
	movdqa 296(%rsp), %xmm13
	paddq 248(%rsp), %xmm1
	punpcklqdq %xmm14, %xmm6
	movdqa 296(%rsp), %xmm14
	pmuludq %xmm9, %xmm13
	pmuludq 120(%rsp), %xmm9
	movdqa %xmm15, 72(%rsp)
	paddq %xmm10, %xmm1
	movdqa 216(%rsp), %xmm15
	pmuludq %xmm7, %xmm14
	movdqa %xmm6, %xmm10
	paddq %xmm9, %xmm1
	pmuludq %xmm7, %xmm15
	paddq %xmm13, %xmm0
	paddq 72(%rsp), %xmm3
	movdqa 120(%rsp), %xmm13
	psllq $12, %xmm10
	paddq %xmm14, %xmm2
	movdqa %xmm5, %xmm14
	pand %xmm8, %xmm14
	pmuludq %xmm7, %xmm13
	paddq %xmm15, %xmm0
	movdqa %xmm14, 248(%rsp)
	movdqa %xmm8, %xmm14
	psrlq $52, %xmm8
	movdqu (%rax), %xmm9
	por %xmm10, %xmm8
	pmuludq 24(%rsp), %xmm7
	movdqu 16(%rax), %xmm10
	paddq %xmm13, %xmm3
	pxor %xmm13, %xmm13
	movdqa %xmm9, %xmm15
	paddq %xmm7, %xmm1
	movdqa %xmm6, %xmm7
	movdqa %xmm10, -72(%rsp)
	punpckldq %xmm10, %xmm15
	movdqa %xmm15, %xmm10
	punpckldq %xmm13, %xmm10
	punpckhdq -72(%rsp), %xmm9
	psrlq $40, %xmm6
	movdqa %xmm10, 72(%rsp)
	movdqa %xmm9, %xmm10
	punpckhdq %xmm13, %xmm9
	psllq $18, %xmm9
	paddq 72(%rsp), %xmm4
	addq $64, %rax
	paddq %xmm9, %xmm3
	movdqa 40(%rsp), %xmm9
	cmpq $63, %rcx
	punpckhdq %xmm13, %xmm15
	psllq $6, %xmm15
	punpckldq %xmm13, %xmm10
	paddq %xmm15, %xmm2
	psllq $12, %xmm10
	por 168(%rsp), %xmm6
	pmuludq %xmm6, %xmm9
	movdqa 88(%rsp), %xmm15
	paddq %xmm10, %xmm0
	movdqa 88(%rsp), %xmm13
	psrlq $14, %xmm7
	pand %xmm5, %xmm8
	movdqa 184(%rsp), %xmm10
	pand %xmm5, %xmm7
	pmuludq %xmm7, %xmm15
	paddq %xmm9, %xmm4
	pmuludq %xmm6, %xmm13
	movdqa 184(%rsp), %xmm9
	paddq 168(%rsp), %xmm1
	pmuludq %xmm7, %xmm10
	pmuludq %xmm6, %xmm9
	paddq %xmm15, %xmm4
	movdqa 184(%rsp), %xmm15
	paddq %xmm13, %xmm2
	psrlq $26, %xmm14
	movdqa 264(%rsp), %xmm13
	paddq %xmm10, %xmm2
	pmuludq %xmm8, %xmm15
	pand %xmm5, %xmm14
	paddq %xmm9, %xmm0
	pmuludq %xmm6, %xmm13
	movdqa 264(%rsp), %xmm9
	movdqa 264(%rsp), %xmm10
	pmuludq %xmm11, %xmm6
	pmuludq %xmm8, %xmm9
	paddq %xmm15, %xmm4
	movdqa 264(%rsp), %xmm15
	pmuludq %xmm14, %xmm10
	paddq %xmm13, %xmm3
	movdqa %xmm7, %xmm13
	pmuludq %xmm7, %xmm15
	paddq %xmm6, %xmm1
	movdqa 312(%rsp), %xmm6
	paddq %xmm9, %xmm2
	pmuludq %xmm11, %xmm13
	movdqa 248(%rsp), %xmm9
	paddq %xmm10, %xmm4
	pmuludq %xmm8, %xmm6
	pmuludq 312(%rsp), %xmm7
	paddq %xmm15, %xmm0
	movdqa %xmm9, %xmm10
	movdqa %xmm14, %xmm15
	pmuludq %xmm11, %xmm10
	paddq %xmm13, %xmm3
	movdqa %xmm8, %xmm13
	pmuludq %xmm11, %xmm13
	paddq %xmm6, %xmm3
	paddq %xmm7, %xmm1
	movdqa 232(%rsp), %xmm6
	pmuludq %xmm11, %xmm15
	pmuludq 232(%rsp), %xmm8
	paddq %xmm10, %xmm4
	paddq %xmm8, %xmm1
	movdqa 312(%rsp), %xmm10
	paddq %xmm13, %xmm0
	pmuludq %xmm14, %xmm6
	movdqa 312(%rsp), %xmm13
	pmuludq %xmm9, %xmm10
	paddq %xmm15, %xmm2
	movdqa 232(%rsp), %xmm7
	pmuludq %xmm14, %xmm13
	pmuludq 152(%rsp), %xmm14
	paddq %xmm14, %xmm1
	pmuludq %xmm9, %xmm7
	paddq %xmm6, %xmm3
	paddq %xmm10, %xmm2
	movdqa 152(%rsp), %xmm10
	paddq %xmm13, %xmm0
	pmuludq %xmm9, %xmm10
	paddq %xmm7, %xmm0
	movdqa %xmm4, %xmm7
	psrlq $26, %xmm7
	pmuludq 56(%rsp), %xmm9
	pand %xmm5, %xmm4
	paddq %xmm7, %xmm2
	paddq %xmm9, %xmm1
	paddq %xmm10, %xmm3
	movdqa %xmm2, %xmm7
	movdqa %xmm2, %xmm9
	movdqa %xmm3, %xmm6
	psrlq $26, %xmm7
	pand %xmm5, %xmm3
	psrlq $26, %xmm6
	paddq %xmm7, %xmm0
	pand %xmm5, %xmm9
	paddq %xmm6, %xmm1
	movdqa %xmm0, %xmm10
	movdqa %xmm1, %xmm6
	pand %xmm5, %xmm10
	pand %xmm5, %xmm1
	psrlq $26, %xmm6
	pmuludq 136(%rsp), %xmm6
	paddq %xmm6, %xmm4
	movdqa %xmm0, %xmm6
	psrlq $26, %xmm6
	movdqa %xmm4, %xmm2
	movdqa %xmm4, %xmm7
	paddq %xmm6, %xmm3
	psrlq $26, %xmm2
	pand %xmm5, %xmm7
	movdqa %xmm3, %xmm0
	paddq %xmm2, %xmm9
	pand %xmm5, %xmm3
	psrlq $26, %xmm0
	paddq %xmm0, %xmm1
	ja .Lpoly1305_blocks_x86_13
	leaq -64(%rdx), %rax
	movdqa %xmm3, %xmm6
	andl $63, %edx
	andq $-64, %rax
	leaq 64(%rsi,%rax), %rsi
.Lpoly1305_blocks_x86_7:
	cmpq $31, %rdx
	jbe .Lpoly1305_blocks_x86_9
	movdqa -24(%rsp), %xmm13
	movdqa %xmm6, %xmm0
	movdqa %xmm6, %xmm3
	movdqa 40(%rsp), %xmm11
	movdqa %xmm1, %xmm12
	testq %rsi, %rsi
	movdqa -40(%rsp), %xmm2
	pmuludq %xmm13, %xmm0
	movdqa %xmm1, %xmm8
	pmuludq %xmm1, %xmm11
	movdqa %xmm10, %xmm4
	movdqa %xmm1, %xmm14
	pmuludq %xmm2, %xmm3
	movdqa %xmm6, %xmm15
	pmuludq %xmm1, %xmm13
	movdqa %xmm7, %xmm1
	pmuludq %xmm2, %xmm12
	paddq %xmm0, %xmm11
	movdqa -56(%rsp), %xmm0
	pmuludq %xmm10, %xmm2
	paddq %xmm3, %xmm13
	pmuludq %xmm0, %xmm4
	movdqa %xmm9, %xmm3
	pmuludq %xmm0, %xmm3
	paddq %xmm2, %xmm11
	pmuludq %xmm0, %xmm8
	movdqa %xmm6, %xmm2
	pmuludq %xmm0, %xmm2
	movdqa -8(%rsp), %xmm0
	paddq %xmm4, %xmm13
	movdqa 312(%rsp), %xmm4
	paddq %xmm3, %xmm11
	pmuludq 312(%rsp), %xmm6
	movdqa 312(%rsp), %xmm3
	pmuludq %xmm0, %xmm1
	paddq %xmm2, %xmm12
	pmuludq %xmm0, %xmm15
	movdqa %xmm9, %xmm2
	pmuludq %xmm0, %xmm2
	pmuludq %xmm7, %xmm3
	paddq %xmm1, %xmm11
	movdqa 232(%rsp), %xmm1
	pmuludq %xmm0, %xmm14
	paddq %xmm15, %xmm8
	pmuludq %xmm10, %xmm0
	paddq %xmm2, %xmm13
	movdqa 312(%rsp), %xmm2
	pmuludq %xmm10, %xmm4
	paddq %xmm3, %xmm13
	movdqa 152(%rsp), %xmm3
	pmuludq %xmm9, %xmm2
	paddq %xmm6, %xmm14
	pmuludq 232(%rsp), %xmm10
	paddq %xmm0, %xmm12
	pmuludq %xmm9, %xmm1
	paddq %xmm10, %xmm14
	movdqa 232(%rsp), %xmm0
	pmuludq %xmm7, %xmm3
	paddq %xmm4, %xmm8
	pmuludq 152(%rsp), %xmm9
	paddq %xmm2, %xmm12
	paddq %xmm9, %xmm14
	pmuludq %xmm7, %xmm0
	paddq %xmm1, %xmm8
	pmuludq 56(%rsp), %xmm7
	paddq %xmm3, %xmm8
	paddq %xmm7, %xmm14
	paddq %xmm0, %xmm12
	je .Lpoly1305_blocks_x86_10
	movdqu (%rsi), %xmm1
	pxor %xmm0, %xmm0
	paddq 168(%rsp), %xmm14
	movdqu 16(%rsi), %xmm2
	movdqa %xmm1, %xmm3
	punpckldq %xmm2, %xmm3
	punpckhdq %xmm2, %xmm1
	movdqa %xmm3, %xmm4
	movdqa %xmm1, %xmm2
	punpckldq %xmm0, %xmm4
	punpckhdq %xmm0, %xmm3
	punpckhdq %xmm0, %xmm1
	punpckldq %xmm0, %xmm2
	movdqa %xmm2, %xmm0
	psllq $6, %xmm3
	paddq %xmm4, %xmm11
	psllq $12, %xmm0
	paddq %xmm3, %xmm13
	psllq $18, %xmm1
	paddq %xmm0, %xmm12
	paddq %xmm1, %xmm8
.Lpoly1305_blocks_x86_10:
	movdqa %xmm11, %xmm9
	movdqa %xmm8, %xmm1
	movdqa %xmm11, %xmm7
	psrlq $26, %xmm9
	movdqa %xmm8, %xmm6
	pand %xmm5, %xmm7
	paddq %xmm13, %xmm9
	psrlq $26, %xmm1
	pand %xmm5, %xmm6
	movdqa %xmm9, %xmm10
	paddq %xmm14, %xmm1
	pand %xmm5, %xmm9
	psrlq $26, %xmm10
	movdqa %xmm1, %xmm0
	pand %xmm5, %xmm1
	paddq %xmm12, %xmm10
	psrlq $26, %xmm0
	pmuludq 136(%rsp), %xmm0
	movdqa %xmm10, %xmm2
	paddq %xmm0, %xmm7
	psrlq $26, %xmm2
	movdqa %xmm7, %xmm0
	pand %xmm5, %xmm10
	paddq %xmm2, %xmm6
	psrlq $26, %xmm0
	pand %xmm5, %xmm7
	movdqa %xmm6, %xmm2
	paddq %xmm0, %xmm9
	pand %xmm5, %xmm6
	psrlq $26, %xmm2
	paddq %xmm2, %xmm1
.Lpoly1305_blocks_x86_9:
	testq %rsi, %rsi
	je .Lpoly1305_blocks_x86_11
	movdqa %xmm7, 0(%rdi)
	movdqa %xmm9, 16(%rdi)
	movdqa %xmm10, 32(%rdi)
	movdqa %xmm6, 48(%rdi)
	movdqa %xmm1, 64(%rdi)
	movq -8(%rbp), %rbx
	leave
	ret
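	/* Accumulator already initialized: load h from the state.  */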
.Lpoly1305_blocks_x86_5:
	movdqa 0(%rdi), %xmm7
	movdqa 16(%rdi), %xmm9
	movdqa 32(%rdi), %xmm10
	movdqa 48(%rdi), %xmm6
	movdqa 64(%rdi), %xmm1
	jmp .Lpoly1305_blocks_x86_6
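	/* No more input: add the two lanes of each limb, propagate carries and
	 * reduce mod 2^130-5, then store h as three 44-bit limbs at offsets
	 * 0/8/16 of the state.  */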
.Lpoly1305_blocks_x86_11:
	movdqa %xmm7, %xmm0
	movdqa %xmm9, %xmm2
	movdqa %xmm6, %xmm3
	psrldq $8, %xmm0
	movabsq $4398046511103, %rbx
	paddq %xmm0, %xmm7
	psrldq $8, %xmm2
	movdqa %xmm10, %xmm0
	movd %xmm7, %edx
	paddq %xmm2, %xmm9
	psrldq $8, %xmm0
	movl %edx, %ecx
	movd %xmm9, %eax
	paddq %xmm0, %xmm10
	shrl $26, %ecx
	psrldq $8, %xmm3
	movdqa %xmm1, %xmm0
	addl %ecx, %eax
	movd %xmm10, %ecx
	paddq %xmm3, %xmm6
	movl %eax, %r9d
	shrl $26, %eax
	psrldq $8, %xmm0
	addl %ecx, %eax
	movd %xmm6, %ecx
	paddq %xmm0, %xmm1
	movl %eax, %esi
	andl $67108863, %r9d
	movd %xmm1, %r10d
	shrl $26, %esi
	andl $67108863, %eax
	andl $67108863, %edx
	addl %ecx, %esi
	salq $8, %rax
	movl %r9d, %ecx
	shrl $18, %r9d
	movl %esi, %r8d
	shrl $26, %esi
	andl $67108863, %r8d
	addl %r10d, %esi
	orq %r9, %rax
	salq $16, %rsi
	movq %r8, %r9
	shrl $10, %r8d
	salq $26, %rcx
	orq %r8, %rsi
	salq $34, %r9
	orq %rdx, %rcx
	movq %rsi, %r8
	shrq $42, %rsi
	movabsq $17592186044415, %rdx
	orq %r9, %rax
	andq %rbx, %r8
	leaq (%rsi,%rsi,4), %rsi
	andq %rdx, %rcx
	andq %rdx, %rax
	movabsq $-4398046511104, %r10
	addq %rsi, %rcx
	movq %rcx, %rsi
	shrq $44, %rcx
	addq %rcx, %rax
	andq %rdx, %rsi
	movq %rax, %rcx
	shrq $44, %rax
	addq %r8, %rax
	andq %rdx, %rcx
	andq %rax, %rbx
	shrq $42, %rax
	leaq (%rsi,%rax,4), %rsi
	addq %rbx, %r10
	addq %rax, %rsi
	movq %rsi, %r8
	shrq $44, %rsi
	andq %rdx, %r8
	addq %rcx, %rsi
	leaq 5(%r8), %r9
	movq %r9, %r11
	andq %rdx, %r9
	shrq $44, %r11
	addq %rsi, %r11
	movq %r11, %rax
	andq %r11, %rdx
	shrq $44, %rax
	addq %rax, %r10
	movq %r10, %rax
	shrq $63, %rax
	subq $1, %rax
	movq %rax, %rcx
	andq %rax, %r9
	andq %rax, %rdx
	notq %rcx
	andq %r10, %rax
	andq %rcx, %r8
	andq %rcx, %rsi
	andq %rbx, %rcx
	orq %r9, %r8
	orq %rdx, %rsi
	orq %rax, %rcx
	movq %r8, 0(%rdi)
	movq %rsi, 8(%rdi)
	movq %rcx, 16(%rdi)
	movq -8(%rbp), %rbx
	movq %rbp, %rax
	subq %rsp, %rax
	pxor %xmm15, %xmm15
	pxor %xmm7, %xmm7
	pxor %xmm14, %xmm14
	pxor %xmm6, %xmm6
	pxor %xmm13, %xmm13
	pxor %xmm5, %xmm5
	pxor %xmm12, %xmm12
	pxor %xmm4, %xmm4
	leave
	addq $8, %rax
	pxor %xmm11, %xmm11
	pxor %xmm3, %xmm3
	pxor %xmm10, %xmm10
	pxor %xmm2, %xmm2
	pxor %xmm9, %xmm9
	pxor %xmm1, %xmm1
	pxor %xmm8, %xmm8
	pxor %xmm0, %xmm0
	ret
ELF(.size _gcry_poly1305_amd64_sse2_blocks,.-_gcry_poly1305_amd64_sse2_blocks;)

#endif