cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

poly1305-x86_64-cryptogams.pl (101913B)


      1#!/usr/bin/env perl
      2# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
      3#
      4# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
      5# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
      6# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
      7#
      8# This code is taken from the OpenSSL project but the author, Andy Polyakov,
      9# has relicensed it under the licenses specified in the SPDX header above.
     10# The original headers, including the original license headers, are
     11# included below for completeness.
     12#
     13# ====================================================================
     14# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     15# project. The module is, however, dual licensed under OpenSSL and
     16# CRYPTOGAMS licenses depending on where you obtain it. For further
     17# details see http://www.openssl.org/~appro/cryptogams/.
     18# ====================================================================
     19#
     20# This module implements Poly1305 hash for x86_64.
     21#
     22# March 2015
     23#
     24# Initial release.
     25#
     26# December 2016
     27#
     28# Add AVX512F+VL+BW code path.
     29#
     30# November 2017
     31#
      32# Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can
      33# be executed even on Knights Landing. The trigger for the modification
      34# was the observation that AVX512 code paths can negatively affect
      35# overall Skylake-X system performance. Since we are likely to suppress
      36# the AVX512F capability flag [at least on Skylake-X], the conversion
      37# serves as a kind of "investment protection". Note that the next *lake
      38# processor, Cannonlake, has an AVX512IFMA code path to execute...
     39#
     40# Numbers are cycles per processed byte with poly1305_blocks alone,
     41# measured with rdtsc at fixed clock frequency.
     42#
     43#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
     44# P4		4.46/+120%	-
     45# Core 2	2.41/+90%	-
     46# Westmere	1.88/+120%	-
     47# Sandy Bridge	1.39/+140%	1.10
     48# Haswell	1.14/+175%	1.11		0.65
     49# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
     50# Silvermont	2.83/+95%	-
     51# Knights L	3.60/?		1.65		1.10	0.41(***)
     52# Goldmont	1.70/+180%	-
     53# VIA Nano	1.82/+150%	-
     54# Sledgehammer	1.38/+160%	-
     55# Bulldozer	2.30/+130%	0.97
     56# Ryzen		1.15/+200%	1.08		1.18
     57#
      58# (*)	improvement coefficients relative to clang are more modest,
      59#	~50% on most processors; in both cases we are comparing to
      60#	__int128 code;
      61# (**)	an SSE2 implementation was attempted, but among non-AVX processors
      62#	it was faster than integer-only code only on older Intel P4 and
      63#	Core processors, by 30-50% (less so the newer the processor), while
      64#	being slower on contemporary ones, e.g. almost 2x slower on Atom;
      65#	as the former are naturally disappearing, SSE2 is deemed unnecessary;
      66# (***)	strangely enough, performance seems to vary from core to core;
      67#	the listed result is the best case;
     68
     69$flavour = shift;
     70$output  = shift;
     71if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     72
     73$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     74$kernel=0; $kernel=1 if (!$flavour && !$output);
     75
     76if (!$kernel) {
     77	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     78	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     79	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     80	die "can't locate x86_64-xlate.pl";
     81
     82	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
     83	*STDOUT=*OUT;
     84
     85	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
     86	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
     87		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
     88	}
     89
     90	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
     91	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
     92		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
     93		$avx += 1 if ($1==2.11 && $2>=8);
     94	}
     95
     96	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
     97	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
     98		$avx = ($1>=10) + ($1>=11);
     99	}
    100
    101	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
    102		$avx = ($2>=3.0) + ($2>3.0);
    103	}
    104} else {
    105	$avx = 4; # The kernel uses ifdefs for this.
    106}
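        # Typical invocations, for reference: the kernel build runs this script
        # with no arguments and captures stdout (perl poly1305-x86_64-cryptogams.pl
        # > poly1305-x86_64.S), which selects the $kernel path above; OpenSSL-style
        # builds instead pass a flavour such as "elf", "mingw64", "nasm" or "masm"
        # (plus an output file), which routes the result through x86_64-xlate.pl.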
    107
    108sub declare_function() {
    109	my ($name, $align, $nargs) = @_;
    110	if($kernel) {
    111		$code .= ".align $align\n";
    112		$code .= "SYM_FUNC_START($name)\n";
    113		$code .= ".L$name:\n";
    114	} else {
    115		$code .= ".globl	$name\n";
    116		$code .= ".type	$name,\@function,$nargs\n";
    117		$code .= ".align	$align\n";
    118		$code .= "$name:\n";
    119	}
    120}
    121
    122sub end_function() {
    123	my ($name) = @_;
    124	if($kernel) {
    125		$code .= "SYM_FUNC_END($name)\n";
    126	} else {
    127		$code .= ".size   $name,.-$name\n";
    128	}
    129}
    130
    131$code.=<<___ if $kernel;
    132#include <linux/linkage.h>
    133___
    134
    135if ($avx) {
    136$code.=<<___ if $kernel;
    137.section .rodata
    138___
    139$code.=<<___;
    140.align	64
    141.Lconst:
    142.Lmask24:
    143.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
    144.L129:
    145.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
    146.Lmask26:
    147.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
    148.Lpermd_avx2:
    149.long	2,2,2,3,2,0,2,1
    150.Lpermd_avx512:
    151.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
    152
    153.L2_44_inp_permd:
    154.long	0,1,1,2,2,3,7,7
    155.L2_44_inp_shift:
    156.quad	0,12,24,64
    157.L2_44_mask:
    158.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
    159.L2_44_shift_rgt:
    160.quad	44,44,42,64
    161.L2_44_shift_lft:
    162.quad	8,8,10,64
    163
    164.align	64
    165.Lx_mask44:
    166.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
    167.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
    168.Lx_mask42:
    169.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
    170.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
    171___
    172}
    173$code.=<<___ if (!$kernel);
    174.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    175.align	16
    176___
    177
    178my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
    179my ($mac,$nonce)=($inp,$len);	# *_emit arguments
    180my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
    181my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
    182
    183sub poly1305_iteration {
    184# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
    185# output:	$h0-$h2 *= $r0-$r1
    186$code.=<<___;
    187	mulq	$h0			# h0*r1
    188	mov	%rax,$d2
    189	 mov	$r0,%rax
    190	mov	%rdx,$d3
    191
    192	mulq	$h0			# h0*r0
    193	mov	%rax,$h0		# future $h0
    194	 mov	$r0,%rax
    195	mov	%rdx,$d1
    196
    197	mulq	$h1			# h1*r0
    198	add	%rax,$d2
    199	 mov	$s1,%rax
    200	adc	%rdx,$d3
    201
    202	mulq	$h1			# h1*s1
    203	 mov	$h2,$h1			# borrow $h1
    204	add	%rax,$h0
    205	adc	%rdx,$d1
    206
    207	imulq	$s1,$h1			# h2*s1
    208	add	$h1,$d2
    209	 mov	$d1,$h1
    210	adc	\$0,$d3
    211
    212	imulq	$r0,$h2			# h2*r0
    213	add	$d2,$h1
    214	mov	\$-4,%rax		# mask value
    215	adc	$h2,$d3
    216
    217	and	$d3,%rax		# last reduction step
    218	mov	$d3,$h2
    219	shr	\$2,$d3
    220	and	\$3,$h2
    221	add	$d3,%rax
    222	add	%rax,$h0
    223	adc	\$0,$h1
    224	adc	\$0,$h2
    225___
    226}
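        # A minimal reference sketch (editorial; not used by this generator) of
        # what the limb arithmetic above computes: one Poly1305 step
        # h = (h * r) mod (2^130 - 5).  The s1 = r1 + (r1>>2) trick stands in
        # for (5*r1)/4, which is exact because clamping clears the low two bits
        # of r1; the factor 5 appears because 2^130 is congruent to 5 modulo the
        # prime.  The assembly reduces only partially, deferring the canonical
        # reduction to poly1305_emit.
        sub __poly1305_iteration_ref {
        	require Math::BigInt;
        	my ($h, $r) = @_;	# Math::BigInt: partially reduced h, clamped r
        	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
        	return $h->copy()->bmul($r)->bmod($p);
        }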
    227
    228########################################################################
     229# The layout of the opaque area is as follows.
    230#
    231#	unsigned __int64 h[3];		# current hash value base 2^64
    232#	unsigned __int64 r[2];		# key value base 2^64
    233
    234$code.=<<___;
    235.text
    236___
    237$code.=<<___ if (!$kernel);
    238.extern	OPENSSL_ia32cap_P
    239
    240.globl	poly1305_init_x86_64
    241.hidden	poly1305_init_x86_64
    242.globl	poly1305_blocks_x86_64
    243.hidden	poly1305_blocks_x86_64
    244.globl	poly1305_emit_x86_64
    245.hidden	poly1305_emit_x86_64
    246___
    247&declare_function("poly1305_init_x86_64", 32, 3);
    248$code.=<<___;
    249	xor	%eax,%eax
    250	mov	%rax,0($ctx)		# initialize hash value
    251	mov	%rax,8($ctx)
    252	mov	%rax,16($ctx)
    253
    254	test	$inp,$inp
    255	je	.Lno_key
    256___
    257$code.=<<___ if (!$kernel);
    258	lea	poly1305_blocks_x86_64(%rip),%r10
    259	lea	poly1305_emit_x86_64(%rip),%r11
    260___
    261$code.=<<___	if (!$kernel && $avx);
    262	mov	OPENSSL_ia32cap_P+4(%rip),%r9
    263	lea	poly1305_blocks_avx(%rip),%rax
    264	lea	poly1305_emit_avx(%rip),%rcx
    265	bt	\$`60-32`,%r9		# AVX?
    266	cmovc	%rax,%r10
    267	cmovc	%rcx,%r11
    268___
    269$code.=<<___	if (!$kernel && $avx>1);
    270	lea	poly1305_blocks_avx2(%rip),%rax
    271	bt	\$`5+32`,%r9		# AVX2?
    272	cmovc	%rax,%r10
    273___
    274$code.=<<___	if (!$kernel && $avx>3);
    275	mov	\$`(1<<31|1<<21|1<<16)`,%rax
    276	shr	\$32,%r9
    277	and	%rax,%r9
    278	cmp	%rax,%r9
    279	je	.Linit_base2_44
    280___
    281$code.=<<___;
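        	# clamp r per the Poly1305 spec: clear the top four bits of each
        	# 32-bit word and the low two bits of the three upper words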
    282	mov	\$0x0ffffffc0fffffff,%rax
    283	mov	\$0x0ffffffc0ffffffc,%rcx
    284	and	0($inp),%rax
    285	and	8($inp),%rcx
    286	mov	%rax,24($ctx)
    287	mov	%rcx,32($ctx)
    288___
    289$code.=<<___	if (!$kernel && $flavour !~ /elf32/);
    290	mov	%r10,0(%rdx)
    291	mov	%r11,8(%rdx)
    292___
    293$code.=<<___	if (!$kernel && $flavour =~ /elf32/);
    294	mov	%r10d,0(%rdx)
    295	mov	%r11d,4(%rdx)
    296___
    297$code.=<<___;
    298	mov	\$1,%eax
    299.Lno_key:
    300	RET
    301___
    302&end_function("poly1305_init_x86_64");
    303
    304&declare_function("poly1305_blocks_x86_64", 32, 4);
    305$code.=<<___;
    306.cfi_startproc
    307.Lblocks:
    308	shr	\$4,$len
    309	jz	.Lno_data		# too short
    310
    311	push	%rbx
    312.cfi_push	%rbx
    313	push	%r12
    314.cfi_push	%r12
    315	push	%r13
    316.cfi_push	%r13
    317	push	%r14
    318.cfi_push	%r14
    319	push	%r15
    320.cfi_push	%r15
    321	push	$ctx
    322.cfi_push	$ctx
    323.Lblocks_body:
    324
    325	mov	$len,%r15		# reassign $len
    326
    327	mov	24($ctx),$r0		# load r
    328	mov	32($ctx),$s1
    329
    330	mov	0($ctx),$h0		# load hash value
    331	mov	8($ctx),$h1
    332	mov	16($ctx),$h2
    333
    334	mov	$s1,$r1
    335	shr	\$2,$s1
    336	mov	$r1,%rax
    337	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
    338	jmp	.Loop
    339
    340.align	32
    341.Loop:
    342	add	0($inp),$h0		# accumulate input
    343	adc	8($inp),$h1
    344	lea	16($inp),$inp
    345	adc	$padbit,$h2
    346___
    347
    348	&poly1305_iteration();
    349
    350$code.=<<___;
    351	mov	$r1,%rax
    352	dec	%r15			# len-=16
    353	jnz	.Loop
    354
    355	mov	0(%rsp),$ctx
    356.cfi_restore	$ctx
    357
    358	mov	$h0,0($ctx)		# store hash value
    359	mov	$h1,8($ctx)
    360	mov	$h2,16($ctx)
    361
    362	mov	8(%rsp),%r15
    363.cfi_restore	%r15
    364	mov	16(%rsp),%r14
    365.cfi_restore	%r14
    366	mov	24(%rsp),%r13
    367.cfi_restore	%r13
    368	mov	32(%rsp),%r12
    369.cfi_restore	%r12
    370	mov	40(%rsp),%rbx
    371.cfi_restore	%rbx
    372	lea	48(%rsp),%rsp
    373.cfi_adjust_cfa_offset	-48
    374.Lno_data:
    375.Lblocks_epilogue:
    376	RET
    377.cfi_endproc
    378___
    379&end_function("poly1305_blocks_x86_64");
    380
    381&declare_function("poly1305_emit_x86_64", 32, 3);
    382$code.=<<___;
    383.Lemit:
    384	mov	0($ctx),%r8	# load hash value
    385	mov	8($ctx),%r9
    386	mov	16($ctx),%r10
    387
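        	# final reduction: h is only partially reduced (below twice the
        	# modulus 2^130-5), so one conditional subtraction suffices; add 5
        	# and, if bit 130 of the sum is set, keep the sum instead of h.
        	# Only the low 128 bits matter, since the nonce is added mod 2^128.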
    388	mov	%r8,%rax
    389	add	\$5,%r8		# compare to modulus
    390	mov	%r9,%rcx
    391	adc	\$0,%r9
    392	adc	\$0,%r10
    393	shr	\$2,%r10	# did 130-bit value overflow?
    394	cmovnz	%r8,%rax
    395	cmovnz	%r9,%rcx
    396
    397	add	0($nonce),%rax	# accumulate nonce
    398	adc	8($nonce),%rcx
    399	mov	%rax,0($mac)	# write result
    400	mov	%rcx,8($mac)
    401
    402	RET
    403___
    404&end_function("poly1305_emit_x86_64");
    405if ($avx) {
    406
    407########################################################################
     408# The layout of the opaque area is as follows.
    409#
    410#	unsigned __int32 h[5];		# current hash value base 2^26
    411#	unsigned __int32 is_base2_26;
    412#	unsigned __int64 r[2];		# key value base 2^64
    413#	unsigned __int64 pad;
    414#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
    415#
     416# where r^n are the base 2^26 digits of the powers of the multiplier key.
     417# There are 5 digits, but the last four are interleaved with their
     418# multiples of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
    419
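        # A minimal sketch (editorial; not used by this generator) of the
        # radix-2^26 split stored for each power of r: five 26-bit digits, with
        # digits 1..4 also kept premultiplied by 5 so that the wrap-around terms
        # become plain 32x32->64-bit multiplies.
        sub __base2_26_digits_ref {
        	require Math::BigInt;
        	my ($x) = @_;		# Math::BigInt, < 2^130
        	my @d  = map { $x->copy()->brsft(26*$_)->band(0x3ffffff)->numify() } 0..4;
        	my @d5 = map { 5*$_ } @d;	# the interleaved "*5" companions
        	return (\@d, \@d5);
        }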
    420my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    421    map("%xmm$_",(0..15));
    422
    423$code.=<<___;
    424.type	__poly1305_block,\@abi-omnipotent
    425.align	32
    426__poly1305_block:
    427	push $ctx
    428___
    429	&poly1305_iteration();
    430$code.=<<___;
    431	pop $ctx
    432	RET
    433.size	__poly1305_block,.-__poly1305_block
    434
    435.type	__poly1305_init_avx,\@abi-omnipotent
    436.align	32
    437__poly1305_init_avx:
    438	push %rbp
    439	mov %rsp,%rbp
    440	mov	$r0,$h0
    441	mov	$r1,$h1
    442	xor	$h2,$h2
    443
    444	lea	48+64($ctx),$ctx	# size optimization
    445
    446	mov	$r1,%rax
    447	call	__poly1305_block	# r^2
    448
    449	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
    450	mov	\$0x3ffffff,%edx
    451	mov	$h0,$d1
    452	and	$h0#d,%eax
    453	mov	$r0,$d2
    454	and	$r0#d,%edx
    455	mov	%eax,`16*0+0-64`($ctx)
    456	shr	\$26,$d1
    457	mov	%edx,`16*0+4-64`($ctx)
    458	shr	\$26,$d2
    459
    460	mov	\$0x3ffffff,%eax
    461	mov	\$0x3ffffff,%edx
    462	and	$d1#d,%eax
    463	and	$d2#d,%edx
    464	mov	%eax,`16*1+0-64`($ctx)
    465	lea	(%rax,%rax,4),%eax	# *5
    466	mov	%edx,`16*1+4-64`($ctx)
    467	lea	(%rdx,%rdx,4),%edx	# *5
    468	mov	%eax,`16*2+0-64`($ctx)
    469	shr	\$26,$d1
    470	mov	%edx,`16*2+4-64`($ctx)
    471	shr	\$26,$d2
    472
    473	mov	$h1,%rax
    474	mov	$r1,%rdx
    475	shl	\$12,%rax
    476	shl	\$12,%rdx
    477	or	$d1,%rax
    478	or	$d2,%rdx
    479	and	\$0x3ffffff,%eax
    480	and	\$0x3ffffff,%edx
    481	mov	%eax,`16*3+0-64`($ctx)
    482	lea	(%rax,%rax,4),%eax	# *5
    483	mov	%edx,`16*3+4-64`($ctx)
    484	lea	(%rdx,%rdx,4),%edx	# *5
    485	mov	%eax,`16*4+0-64`($ctx)
    486	mov	$h1,$d1
    487	mov	%edx,`16*4+4-64`($ctx)
    488	mov	$r1,$d2
    489
    490	mov	\$0x3ffffff,%eax
    491	mov	\$0x3ffffff,%edx
    492	shr	\$14,$d1
    493	shr	\$14,$d2
    494	and	$d1#d,%eax
    495	and	$d2#d,%edx
    496	mov	%eax,`16*5+0-64`($ctx)
    497	lea	(%rax,%rax,4),%eax	# *5
    498	mov	%edx,`16*5+4-64`($ctx)
    499	lea	(%rdx,%rdx,4),%edx	# *5
    500	mov	%eax,`16*6+0-64`($ctx)
    501	shr	\$26,$d1
    502	mov	%edx,`16*6+4-64`($ctx)
    503	shr	\$26,$d2
    504
    505	mov	$h2,%rax
    506	shl	\$24,%rax
    507	or	%rax,$d1
    508	mov	$d1#d,`16*7+0-64`($ctx)
    509	lea	($d1,$d1,4),$d1		# *5
    510	mov	$d2#d,`16*7+4-64`($ctx)
    511	lea	($d2,$d2,4),$d2		# *5
    512	mov	$d1#d,`16*8+0-64`($ctx)
    513	mov	$d2#d,`16*8+4-64`($ctx)
    514
    515	mov	$r1,%rax
    516	call	__poly1305_block	# r^3
    517
    518	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
    519	mov	$h0,$d1
    520	and	$h0#d,%eax
    521	shr	\$26,$d1
    522	mov	%eax,`16*0+12-64`($ctx)
    523
    524	mov	\$0x3ffffff,%edx
    525	and	$d1#d,%edx
    526	mov	%edx,`16*1+12-64`($ctx)
    527	lea	(%rdx,%rdx,4),%edx	# *5
    528	shr	\$26,$d1
    529	mov	%edx,`16*2+12-64`($ctx)
    530
    531	mov	$h1,%rax
    532	shl	\$12,%rax
    533	or	$d1,%rax
    534	and	\$0x3ffffff,%eax
    535	mov	%eax,`16*3+12-64`($ctx)
    536	lea	(%rax,%rax,4),%eax	# *5
    537	mov	$h1,$d1
    538	mov	%eax,`16*4+12-64`($ctx)
    539
    540	mov	\$0x3ffffff,%edx
    541	shr	\$14,$d1
    542	and	$d1#d,%edx
    543	mov	%edx,`16*5+12-64`($ctx)
    544	lea	(%rdx,%rdx,4),%edx	# *5
    545	shr	\$26,$d1
    546	mov	%edx,`16*6+12-64`($ctx)
    547
    548	mov	$h2,%rax
    549	shl	\$24,%rax
    550	or	%rax,$d1
    551	mov	$d1#d,`16*7+12-64`($ctx)
    552	lea	($d1,$d1,4),$d1		# *5
    553	mov	$d1#d,`16*8+12-64`($ctx)
    554
    555	mov	$r1,%rax
    556	call	__poly1305_block	# r^4
    557
    558	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
    559	mov	$h0,$d1
    560	and	$h0#d,%eax
    561	shr	\$26,$d1
    562	mov	%eax,`16*0+8-64`($ctx)
    563
    564	mov	\$0x3ffffff,%edx
    565	and	$d1#d,%edx
    566	mov	%edx,`16*1+8-64`($ctx)
    567	lea	(%rdx,%rdx,4),%edx	# *5
    568	shr	\$26,$d1
    569	mov	%edx,`16*2+8-64`($ctx)
    570
    571	mov	$h1,%rax
    572	shl	\$12,%rax
    573	or	$d1,%rax
    574	and	\$0x3ffffff,%eax
    575	mov	%eax,`16*3+8-64`($ctx)
    576	lea	(%rax,%rax,4),%eax	# *5
    577	mov	$h1,$d1
    578	mov	%eax,`16*4+8-64`($ctx)
    579
    580	mov	\$0x3ffffff,%edx
    581	shr	\$14,$d1
    582	and	$d1#d,%edx
    583	mov	%edx,`16*5+8-64`($ctx)
    584	lea	(%rdx,%rdx,4),%edx	# *5
    585	shr	\$26,$d1
    586	mov	%edx,`16*6+8-64`($ctx)
    587
    588	mov	$h2,%rax
    589	shl	\$24,%rax
    590	or	%rax,$d1
    591	mov	$d1#d,`16*7+8-64`($ctx)
    592	lea	($d1,$d1,4),$d1		# *5
    593	mov	$d1#d,`16*8+8-64`($ctx)
    594
    595	lea	-48-64($ctx),$ctx	# size [de-]optimization
    596	pop %rbp
    597	RET
    598.size	__poly1305_init_avx,.-__poly1305_init_avx
    599___
    600
    601&declare_function("poly1305_blocks_avx", 32, 4);
    602$code.=<<___;
    603.cfi_startproc
    604	mov	20($ctx),%r8d		# is_base2_26
    605	cmp	\$128,$len
    606	jae	.Lblocks_avx
    607	test	%r8d,%r8d
    608	jz	.Lblocks
    609
    610.Lblocks_avx:
    611	and	\$-16,$len
    612	jz	.Lno_data_avx
    613
    614	vzeroupper
    615
    616	test	%r8d,%r8d
    617	jz	.Lbase2_64_avx
    618
    619	test	\$31,$len
    620	jz	.Leven_avx
    621
    622	push	%rbp
    623.cfi_push	%rbp
    624	mov 	%rsp,%rbp
    625	push	%rbx
    626.cfi_push	%rbx
    627	push	%r12
    628.cfi_push	%r12
    629	push	%r13
    630.cfi_push	%r13
    631	push	%r14
    632.cfi_push	%r14
    633	push	%r15
    634.cfi_push	%r15
    635.Lblocks_avx_body:
    636
    637	mov	$len,%r15		# reassign $len
    638
    639	mov	0($ctx),$d1		# load hash value
    640	mov	8($ctx),$d2
    641	mov	16($ctx),$h2#d
    642
    643	mov	24($ctx),$r0		# load r
    644	mov	32($ctx),$s1
    645
    646	################################# base 2^26 -> base 2^64
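        	# i.e. repack five 26-bit digits t0..t4 (value = t0 + t1*2^26 +
        	# t2*2^52 + t3*2^78 + t4*2^104) into three 64-bit words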
    647	mov	$d1#d,$h0#d
    648	and	\$`-1*(1<<31)`,$d1
    649	mov	$d2,$r1			# borrow $r1
    650	mov	$d2#d,$h1#d
    651	and	\$`-1*(1<<31)`,$d2
    652
    653	shr	\$6,$d1
    654	shl	\$52,$r1
    655	add	$d1,$h0
    656	shr	\$12,$h1
    657	shr	\$18,$d2
    658	add	$r1,$h0
    659	adc	$d2,$h1
    660
    661	mov	$h2,$d1
    662	shl	\$40,$d1
    663	shr	\$24,$h2
    664	add	$d1,$h1
    665	adc	\$0,$h2			# can be partially reduced...
    666
    667	mov	\$-4,$d2		# ... so reduce
    668	mov	$h2,$d1
    669	and	$h2,$d2
    670	shr	\$2,$d1
    671	and	\$3,$h2
    672	add	$d2,$d1			# =*5
    673	add	$d1,$h0
    674	adc	\$0,$h1
    675	adc	\$0,$h2
    676
    677	mov	$s1,$r1
    678	mov	$s1,%rax
    679	shr	\$2,$s1
    680	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
    681
    682	add	0($inp),$h0		# accumulate input
    683	adc	8($inp),$h1
    684	lea	16($inp),$inp
    685	adc	$padbit,$h2
    686
    687	call	__poly1305_block
    688
    689	test	$padbit,$padbit		# if $padbit is zero,
    690	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
    691
    692	################################# base 2^64 -> base 2^26
    693	mov	$h0,%rax
    694	mov	$h0,%rdx
    695	shr	\$52,$h0
    696	mov	$h1,$r0
    697	mov	$h1,$r1
    698	shr	\$26,%rdx
    699	and	\$0x3ffffff,%rax	# h[0]
    700	shl	\$12,$r0
    701	and	\$0x3ffffff,%rdx	# h[1]
    702	shr	\$14,$h1
    703	or	$r0,$h0
    704	shl	\$24,$h2
    705	and	\$0x3ffffff,$h0		# h[2]
    706	shr	\$40,$r1
    707	and	\$0x3ffffff,$h1		# h[3]
    708	or	$r1,$h2			# h[4]
    709
    710	sub	\$16,%r15
    711	jz	.Lstore_base2_26_avx
    712
    713	vmovd	%rax#d,$H0
    714	vmovd	%rdx#d,$H1
    715	vmovd	$h0#d,$H2
    716	vmovd	$h1#d,$H3
    717	vmovd	$h2#d,$H4
    718	jmp	.Lproceed_avx
    719
    720.align	32
    721.Lstore_base2_64_avx:
    722	mov	$h0,0($ctx)
    723	mov	$h1,8($ctx)
    724	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
    725	jmp	.Ldone_avx
    726
    727.align	16
    728.Lstore_base2_26_avx:
    729	mov	%rax#d,0($ctx)		# store hash value base 2^26
    730	mov	%rdx#d,4($ctx)
    731	mov	$h0#d,8($ctx)
    732	mov	$h1#d,12($ctx)
    733	mov	$h2#d,16($ctx)
    734.align	16
    735.Ldone_avx:
    736	pop 		%r15
    737.cfi_restore	%r15
    738	pop 		%r14
    739.cfi_restore	%r14
    740	pop 		%r13
    741.cfi_restore	%r13
    742	pop 		%r12
    743.cfi_restore	%r12
    744	pop 		%rbx
    745.cfi_restore	%rbx
    746	pop 		%rbp
    747.cfi_restore	%rbp
    748.Lno_data_avx:
    749.Lblocks_avx_epilogue:
    750	RET
    751.cfi_endproc
    752
    753.align	32
    754.Lbase2_64_avx:
    755.cfi_startproc
    756	push	%rbp
    757.cfi_push	%rbp
    758	mov 	%rsp,%rbp
    759	push	%rbx
    760.cfi_push	%rbx
    761	push	%r12
    762.cfi_push	%r12
    763	push	%r13
    764.cfi_push	%r13
    765	push	%r14
    766.cfi_push	%r14
    767	push	%r15
    768.cfi_push	%r15
    769.Lbase2_64_avx_body:
    770
    771	mov	$len,%r15		# reassign $len
    772
    773	mov	24($ctx),$r0		# load r
    774	mov	32($ctx),$s1
    775
    776	mov	0($ctx),$h0		# load hash value
    777	mov	8($ctx),$h1
    778	mov	16($ctx),$h2#d
    779
    780	mov	$s1,$r1
    781	mov	$s1,%rax
    782	shr	\$2,$s1
    783	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
    784
    785	test	\$31,$len
    786	jz	.Linit_avx
    787
    788	add	0($inp),$h0		# accumulate input
    789	adc	8($inp),$h1
    790	lea	16($inp),$inp
    791	adc	$padbit,$h2
    792	sub	\$16,%r15
    793
    794	call	__poly1305_block
    795
    796.Linit_avx:
    797	################################# base 2^64 -> base 2^26
    798	mov	$h0,%rax
    799	mov	$h0,%rdx
    800	shr	\$52,$h0
    801	mov	$h1,$d1
    802	mov	$h1,$d2
    803	shr	\$26,%rdx
    804	and	\$0x3ffffff,%rax	# h[0]
    805	shl	\$12,$d1
    806	and	\$0x3ffffff,%rdx	# h[1]
    807	shr	\$14,$h1
    808	or	$d1,$h0
    809	shl	\$24,$h2
    810	and	\$0x3ffffff,$h0		# h[2]
    811	shr	\$40,$d2
    812	and	\$0x3ffffff,$h1		# h[3]
    813	or	$d2,$h2			# h[4]
    814
    815	vmovd	%rax#d,$H0
    816	vmovd	%rdx#d,$H1
    817	vmovd	$h0#d,$H2
    818	vmovd	$h1#d,$H3
    819	vmovd	$h2#d,$H4
    820	movl	\$1,20($ctx)		# set is_base2_26
    821
    822	call	__poly1305_init_avx
    823
    824.Lproceed_avx:
    825	mov	%r15,$len
    826	pop 		%r15
    827.cfi_restore	%r15
    828	pop 		%r14
    829.cfi_restore	%r14
    830	pop 		%r13
    831.cfi_restore	%r13
    832	pop 		%r12
    833.cfi_restore	%r12
    834	pop 		%rbx
    835.cfi_restore	%rbx
    836	pop 		%rbp
    837.cfi_restore	%rbp
    838.Lbase2_64_avx_epilogue:
    839	jmp	.Ldo_avx
    840.cfi_endproc
    841
    842.align	32
    843.Leven_avx:
    844.cfi_startproc
    845	vmovd		4*0($ctx),$H0		# load hash value
    846	vmovd		4*1($ctx),$H1
    847	vmovd		4*2($ctx),$H2
    848	vmovd		4*3($ctx),$H3
    849	vmovd		4*4($ctx),$H4
    850
    851.Ldo_avx:
    852___
    853$code.=<<___	if (!$win64);
    854	lea		8(%rsp),%r10
    855.cfi_def_cfa_register	%r10
    856	and		\$-32,%rsp
    857	sub		\$-8,%rsp
    858	lea		-0x58(%rsp),%r11
    859	sub		\$0x178,%rsp
    860___
    861$code.=<<___	if ($win64);
    862	lea		-0xf8(%rsp),%r11
    863	sub		\$0x218,%rsp
    864	vmovdqa		%xmm6,0x50(%r11)
    865	vmovdqa		%xmm7,0x60(%r11)
    866	vmovdqa		%xmm8,0x70(%r11)
    867	vmovdqa		%xmm9,0x80(%r11)
    868	vmovdqa		%xmm10,0x90(%r11)
    869	vmovdqa		%xmm11,0xa0(%r11)
    870	vmovdqa		%xmm12,0xb0(%r11)
    871	vmovdqa		%xmm13,0xc0(%r11)
    872	vmovdqa		%xmm14,0xd0(%r11)
    873	vmovdqa		%xmm15,0xe0(%r11)
    874.Ldo_avx_body:
    875___
    876$code.=<<___;
    877	sub		\$64,$len
    878	lea		-32($inp),%rax
    879	cmovc		%rax,$inp
    880
    881	vmovdqu		`16*3`($ctx),$D4	# preload r0^2
    882	lea		`16*3+64`($ctx),$ctx	# size optimization
    883	lea		.Lconst(%rip),%rcx
    884
    885	################################################################
    886	# load input
    887	vmovdqu		16*2($inp),$T0
    888	vmovdqu		16*3($inp),$T1
    889	vmovdqa		64(%rcx),$MASK		# .Lmask26
    890
    891	vpsrldq		\$6,$T0,$T2		# splat input
    892	vpsrldq		\$6,$T1,$T3
    893	vpunpckhqdq	$T1,$T0,$T4		# 4
    894	vpunpcklqdq	$T1,$T0,$T0		# 0:1
    895	vpunpcklqdq	$T3,$T2,$T3		# 2:3
    896
    897	vpsrlq		\$40,$T4,$T4		# 4
    898	vpsrlq		\$26,$T0,$T1
    899	vpand		$MASK,$T0,$T0		# 0
    900	vpsrlq		\$4,$T3,$T2
    901	vpand		$MASK,$T1,$T1		# 1
    902	vpsrlq		\$30,$T3,$T3
    903	vpand		$MASK,$T2,$T2		# 2
    904	vpand		$MASK,$T3,$T3		# 3
    905	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
    906
    907	jbe		.Lskip_loop_avx
    908
    909	# expand and copy pre-calculated table to stack
    910	vmovdqu		`16*1-64`($ctx),$D1
    911	vmovdqu		`16*2-64`($ctx),$D2
    912	vpshufd		\$0xEE,$D4,$D3		# 34xx -> 3434
    913	vpshufd		\$0x44,$D4,$D0		# xx12 -> 1212
    914	vmovdqa		$D3,-0x90(%r11)
    915	vmovdqa		$D0,0x00(%rsp)
    916	vpshufd		\$0xEE,$D1,$D4
    917	vmovdqu		`16*3-64`($ctx),$D0
    918	vpshufd		\$0x44,$D1,$D1
    919	vmovdqa		$D4,-0x80(%r11)
    920	vmovdqa		$D1,0x10(%rsp)
    921	vpshufd		\$0xEE,$D2,$D3
    922	vmovdqu		`16*4-64`($ctx),$D1
    923	vpshufd		\$0x44,$D2,$D2
    924	vmovdqa		$D3,-0x70(%r11)
    925	vmovdqa		$D2,0x20(%rsp)
    926	vpshufd		\$0xEE,$D0,$D4
    927	vmovdqu		`16*5-64`($ctx),$D2
    928	vpshufd		\$0x44,$D0,$D0
    929	vmovdqa		$D4,-0x60(%r11)
    930	vmovdqa		$D0,0x30(%rsp)
    931	vpshufd		\$0xEE,$D1,$D3
    932	vmovdqu		`16*6-64`($ctx),$D0
    933	vpshufd		\$0x44,$D1,$D1
    934	vmovdqa		$D3,-0x50(%r11)
    935	vmovdqa		$D1,0x40(%rsp)
    936	vpshufd		\$0xEE,$D2,$D4
    937	vmovdqu		`16*7-64`($ctx),$D1
    938	vpshufd		\$0x44,$D2,$D2
    939	vmovdqa		$D4,-0x40(%r11)
    940	vmovdqa		$D2,0x50(%rsp)
    941	vpshufd		\$0xEE,$D0,$D3
    942	vmovdqu		`16*8-64`($ctx),$D2
    943	vpshufd		\$0x44,$D0,$D0
    944	vmovdqa		$D3,-0x30(%r11)
    945	vmovdqa		$D0,0x60(%rsp)
    946	vpshufd		\$0xEE,$D1,$D4
    947	vpshufd		\$0x44,$D1,$D1
    948	vmovdqa		$D4,-0x20(%r11)
    949	vmovdqa		$D1,0x70(%rsp)
    950	vpshufd		\$0xEE,$D2,$D3
    951	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
    952	vpshufd		\$0x44,$D2,$D2
    953	vmovdqa		$D3,-0x10(%r11)
    954	vmovdqa		$D2,0x80(%rsp)
    955
    956	jmp		.Loop_avx
    957
    958.align	32
    959.Loop_avx:
    960	################################################################
    961	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
    962	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
    963	#   \___________________/
    964	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
    965	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
    966	#   \___________________/ \____________________/
    967	#
    968	# Note that we start with inp[2:3]*r^2. This is because it
     969	# doesn't depend on the reduction in the previous iteration.
    970	################################################################
    971	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
    972	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
    973	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
    974	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
    975	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
    976	#
    977	# though note that $Tx and $Hx are "reversed" in this section,
    978	# and $D4 is preloaded with r0^2...
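        	# the 5*r terms come from the modulus: any partial product landing
        	# at digit index 5 or above carries a factor 2^130 = 5 (mod 2^130-5);
        	# e.g. h4*r1 sits at 2^130 and therefore folds back into d0 as
        	# h4*(5*r1) = h4*s1, which is why the 5*r multiples are precomputed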
    979
    980	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
    981	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
    982	  vmovdqa	$H2,0x20(%r11)				# offload hash
     983	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
    984	 vmovdqa	0x10(%rsp),$H2		# r1^2
    985	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
    986	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
    987
    988	  vmovdqa	$H0,0x00(%r11)				#
    989	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
    990	  vmovdqa	$H1,0x10(%r11)				#
    991	vpmuludq	$T3,$H2,$H1		# h3*r1
    992	vpaddq		$H0,$D0,$D0		# d0 += h4*s1
    993	vpaddq		$H1,$D4,$D4		# d4 += h3*r1
    994	  vmovdqa	$H3,0x30(%r11)				#
    995	vpmuludq	$T2,$H2,$H0		# h2*r1
    996	vpmuludq	$T1,$H2,$H1		# h1*r1
    997	vpaddq		$H0,$D3,$D3		# d3 += h2*r1
    998	 vmovdqa	0x30(%rsp),$H3		# r2^2
    999	vpaddq		$H1,$D2,$D2		# d2 += h1*r1
   1000	  vmovdqa	$H4,0x40(%r11)				#
   1001	vpmuludq	$T0,$H2,$H2		# h0*r1
   1002	 vpmuludq	$T2,$H3,$H0		# h2*r2
   1003	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
   1004
   1005	 vmovdqa	0x40(%rsp),$H4		# s2^2
   1006	vpaddq		$H0,$D4,$D4		# d4 += h2*r2
   1007	vpmuludq	$T1,$H3,$H1		# h1*r2
   1008	vpmuludq	$T0,$H3,$H3		# h0*r2
   1009	vpaddq		$H1,$D3,$D3		# d3 += h1*r2
   1010	 vmovdqa	0x50(%rsp),$H2		# r3^2
   1011	vpaddq		$H3,$D2,$D2		# d2 += h0*r2
   1012	vpmuludq	$T4,$H4,$H0		# h4*s2
   1013	vpmuludq	$T3,$H4,$H4		# h3*s2
   1014	vpaddq		$H0,$D1,$D1		# d1 += h4*s2
   1015	 vmovdqa	0x60(%rsp),$H3		# s3^2
   1016	vpaddq		$H4,$D0,$D0		# d0 += h3*s2
   1017
   1018	 vmovdqa	0x80(%rsp),$H4		# s4^2
   1019	vpmuludq	$T1,$H2,$H1		# h1*r3
   1020	vpmuludq	$T0,$H2,$H2		# h0*r3
   1021	vpaddq		$H1,$D4,$D4		# d4 += h1*r3
   1022	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
   1023	vpmuludq	$T4,$H3,$H0		# h4*s3
   1024	vpmuludq	$T3,$H3,$H1		# h3*s3
   1025	vpaddq		$H0,$D2,$D2		# d2 += h4*s3
   1026	 vmovdqu	16*0($inp),$H0				# load input
   1027	vpaddq		$H1,$D1,$D1		# d1 += h3*s3
   1028	vpmuludq	$T2,$H3,$H3		# h2*s3
   1029	 vpmuludq	$T2,$H4,$T2		# h2*s4
   1030	vpaddq		$H3,$D0,$D0		# d0 += h2*s3
   1031
   1032	 vmovdqu	16*1($inp),$H1				#
   1033	vpaddq		$T2,$D1,$D1		# d1 += h2*s4
   1034	vpmuludq	$T3,$H4,$T3		# h3*s4
   1035	vpmuludq	$T4,$H4,$T4		# h4*s4
   1036	 vpsrldq	\$6,$H0,$H2				# splat input
   1037	vpaddq		$T3,$D2,$D2		# d2 += h3*s4
   1038	vpaddq		$T4,$D3,$D3		# d3 += h4*s4
   1039	 vpsrldq	\$6,$H1,$H3				#
   1040	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
   1041	vpmuludq	$T1,$H4,$T0		# h1*s4
   1042	 vpunpckhqdq	$H1,$H0,$H4		# 4
   1043	vpaddq		$T4,$D4,$D4		# d4 += h0*r4
   1044	 vmovdqa	-0x90(%r11),$T4		# r0^4
   1045	vpaddq		$T0,$D0,$D0		# d0 += h1*s4
   1046
   1047	vpunpcklqdq	$H1,$H0,$H0		# 0:1
   1048	vpunpcklqdq	$H3,$H2,$H3		# 2:3
   1049
   1050	#vpsrlq		\$40,$H4,$H4		# 4
   1051	vpsrldq		\$`40/8`,$H4,$H4	# 4
   1052	vpsrlq		\$26,$H0,$H1
   1053	vpand		$MASK,$H0,$H0		# 0
   1054	vpsrlq		\$4,$H3,$H2
   1055	vpand		$MASK,$H1,$H1		# 1
   1056	vpand		0(%rcx),$H4,$H4		# .Lmask24
   1057	vpsrlq		\$30,$H3,$H3
   1058	vpand		$MASK,$H2,$H2		# 2
   1059	vpand		$MASK,$H3,$H3		# 3
   1060	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
   1061
   1062	vpaddq		0x00(%r11),$H0,$H0	# add hash value
   1063	vpaddq		0x10(%r11),$H1,$H1
   1064	vpaddq		0x20(%r11),$H2,$H2
   1065	vpaddq		0x30(%r11),$H3,$H3
   1066	vpaddq		0x40(%r11),$H4,$H4
   1067
   1068	lea		16*2($inp),%rax
   1069	lea		16*4($inp),$inp
   1070	sub		\$64,$len
   1071	cmovc		%rax,$inp
   1072
   1073	################################################################
   1074	# Now we accumulate (inp[0:1]+hash)*r^4
   1075	################################################################
   1076	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
   1077	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
   1078	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
   1079	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
   1080	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
   1081
   1082	vpmuludq	$H0,$T4,$T0		# h0*r0
   1083	vpmuludq	$H1,$T4,$T1		# h1*r0
   1084	vpaddq		$T0,$D0,$D0
   1085	vpaddq		$T1,$D1,$D1
   1086	 vmovdqa	-0x80(%r11),$T2		# r1^4
   1087	vpmuludq	$H2,$T4,$T0		# h2*r0
   1088	vpmuludq	$H3,$T4,$T1		# h3*r0
   1089	vpaddq		$T0,$D2,$D2
   1090	vpaddq		$T1,$D3,$D3
   1091	vpmuludq	$H4,$T4,$T4		# h4*r0
   1092	 vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
   1093	vpaddq		$T4,$D4,$D4
   1094
   1095	vpaddq		$T0,$D0,$D0		# d0 += h4*s1
   1096	vpmuludq	$H2,$T2,$T1		# h2*r1
   1097	vpmuludq	$H3,$T2,$T0		# h3*r1
   1098	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
   1099	 vmovdqa	-0x60(%r11),$T3		# r2^4
   1100	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
   1101	vpmuludq	$H1,$T2,$T1		# h1*r1
   1102	vpmuludq	$H0,$T2,$T2		# h0*r1
   1103	vpaddq		$T1,$D2,$D2		# d2 += h1*r1
   1104	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
   1105
   1106	 vmovdqa	-0x50(%r11),$T4		# s2^4
   1107	vpmuludq	$H2,$T3,$T0		# h2*r2
   1108	vpmuludq	$H1,$T3,$T1		# h1*r2
   1109	vpaddq		$T0,$D4,$D4		# d4 += h2*r2
   1110	vpaddq		$T1,$D3,$D3		# d3 += h1*r2
   1111	 vmovdqa	-0x40(%r11),$T2		# r3^4
   1112	vpmuludq	$H0,$T3,$T3		# h0*r2
   1113	vpmuludq	$H4,$T4,$T0		# h4*s2
   1114	vpaddq		$T3,$D2,$D2		# d2 += h0*r2
   1115	vpaddq		$T0,$D1,$D1		# d1 += h4*s2
   1116	 vmovdqa	-0x30(%r11),$T3		# s3^4
   1117	vpmuludq	$H3,$T4,$T4		# h3*s2
   1118	 vpmuludq	$H1,$T2,$T1		# h1*r3
   1119	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
   1120
   1121	 vmovdqa	-0x10(%r11),$T4		# s4^4
   1122	vpaddq		$T1,$D4,$D4		# d4 += h1*r3
   1123	vpmuludq	$H0,$T2,$T2		# h0*r3
   1124	vpmuludq	$H4,$T3,$T0		# h4*s3
   1125	vpaddq		$T2,$D3,$D3		# d3 += h0*r3
   1126	vpaddq		$T0,$D2,$D2		# d2 += h4*s3
   1127	 vmovdqu	16*2($inp),$T0				# load input
   1128	vpmuludq	$H3,$T3,$T2		# h3*s3
   1129	vpmuludq	$H2,$T3,$T3		# h2*s3
   1130	vpaddq		$T2,$D1,$D1		# d1 += h3*s3
   1131	 vmovdqu	16*3($inp),$T1				#
   1132	vpaddq		$T3,$D0,$D0		# d0 += h2*s3
   1133
   1134	vpmuludq	$H2,$T4,$H2		# h2*s4
   1135	vpmuludq	$H3,$T4,$H3		# h3*s4
   1136	 vpsrldq	\$6,$T0,$T2				# splat input
   1137	vpaddq		$H2,$D1,$D1		# d1 += h2*s4
   1138	vpmuludq	$H4,$T4,$H4		# h4*s4
   1139	 vpsrldq	\$6,$T1,$T3				#
   1140	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
   1141	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
   1142	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
   1143	vpmuludq	$H1,$T4,$H0
   1144	 vpunpckhqdq	$T1,$T0,$T4		# 4
   1145	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
   1146	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
   1147
   1148	vpunpcklqdq	$T1,$T0,$T0		# 0:1
   1149	vpunpcklqdq	$T3,$T2,$T3		# 2:3
   1150
   1151	#vpsrlq		\$40,$T4,$T4		# 4
   1152	vpsrldq		\$`40/8`,$T4,$T4	# 4
   1153	vpsrlq		\$26,$T0,$T1
   1154	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
   1155	vpand		$MASK,$T0,$T0		# 0
   1156	vpsrlq		\$4,$T3,$T2
   1157	vpand		$MASK,$T1,$T1		# 1
   1158	vpand		0(%rcx),$T4,$T4		# .Lmask24
   1159	vpsrlq		\$30,$T3,$T3
   1160	vpand		$MASK,$T2,$T2		# 2
   1161	vpand		$MASK,$T3,$T3		# 3
   1162	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
   1163
   1164	################################################################
   1165	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
   1166	# and P. Schwabe
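        	# one carry pass per digit is enough here to bring every digit back
        	# below 2^26 plus a small excess; the h4 -> h0 wrap multiplies the
        	# carry by 5 as x + (x<<2) (the vpaddq/vpsllq pair below), and the
        	# canonical reduction is deferred to poly1305_emit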
   1167
   1168	vpsrlq		\$26,$H3,$D3
   1169	vpand		$MASK,$H3,$H3
   1170	vpaddq		$D3,$H4,$H4		# h3 -> h4
   1171
   1172	vpsrlq		\$26,$H0,$D0
   1173	vpand		$MASK,$H0,$H0
   1174	vpaddq		$D0,$D1,$H1		# h0 -> h1
   1175
   1176	vpsrlq		\$26,$H4,$D0
   1177	vpand		$MASK,$H4,$H4
   1178
   1179	vpsrlq		\$26,$H1,$D1
   1180	vpand		$MASK,$H1,$H1
   1181	vpaddq		$D1,$H2,$H2		# h1 -> h2
   1182
   1183	vpaddq		$D0,$H0,$H0
   1184	vpsllq		\$2,$D0,$D0
   1185	vpaddq		$D0,$H0,$H0		# h4 -> h0
   1186
   1187	vpsrlq		\$26,$H2,$D2
   1188	vpand		$MASK,$H2,$H2
   1189	vpaddq		$D2,$H3,$H3		# h2 -> h3
   1190
   1191	vpsrlq		\$26,$H0,$D0
   1192	vpand		$MASK,$H0,$H0
   1193	vpaddq		$D0,$H1,$H1		# h0 -> h1
   1194
   1195	vpsrlq		\$26,$H3,$D3
   1196	vpand		$MASK,$H3,$H3
   1197	vpaddq		$D3,$H4,$H4		# h3 -> h4
   1198
   1199	ja		.Loop_avx
   1200
   1201.Lskip_loop_avx:
   1202	################################################################
   1203	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
   1204
   1205	vpshufd		\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
   1206	add		\$32,$len
   1207	jnz		.Long_tail_avx
   1208
   1209	vpaddq		$H2,$T2,$T2
   1210	vpaddq		$H0,$T0,$T0
   1211	vpaddq		$H1,$T1,$T1
   1212	vpaddq		$H3,$T3,$T3
   1213	vpaddq		$H4,$T4,$T4
   1214
   1215.Long_tail_avx:
   1216	vmovdqa		$H2,0x20(%r11)
   1217	vmovdqa		$H0,0x00(%r11)
   1218	vmovdqa		$H1,0x10(%r11)
   1219	vmovdqa		$H3,0x30(%r11)
   1220	vmovdqa		$H4,0x40(%r11)
   1221
   1222	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
   1223	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
   1224	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
   1225	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
   1226	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
   1227
   1228	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
   1229	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
   1230	 vpshufd	\$0x10,`16*1-64`($ctx),$H2		# r1^n
   1231	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
   1232	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
   1233	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
   1234
   1235	vpmuludq	$T3,$H2,$H0		# h3*r1
   1236	vpaddq		$H0,$D4,$D4		# d4 += h3*r1
   1237	 vpshufd	\$0x10,`16*2-64`($ctx),$H3		# s1^n
   1238	vpmuludq	$T2,$H2,$H1		# h2*r1
   1239	vpaddq		$H1,$D3,$D3		# d3 += h2*r1
   1240	 vpshufd	\$0x10,`16*3-64`($ctx),$H4		# r2^n
   1241	vpmuludq	$T1,$H2,$H0		# h1*r1
   1242	vpaddq		$H0,$D2,$D2		# d2 += h1*r1
   1243	vpmuludq	$T0,$H2,$H2		# h0*r1
   1244	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
   1245	vpmuludq	$T4,$H3,$H3		# h4*s1
   1246	vpaddq		$H3,$D0,$D0		# d0 += h4*s1
   1247
   1248	 vpshufd	\$0x10,`16*4-64`($ctx),$H2		# s2^n
   1249	vpmuludq	$T2,$H4,$H1		# h2*r2
   1250	vpaddq		$H1,$D4,$D4		# d4 += h2*r2
   1251	vpmuludq	$T1,$H4,$H0		# h1*r2
   1252	vpaddq		$H0,$D3,$D3		# d3 += h1*r2
   1253	 vpshufd	\$0x10,`16*5-64`($ctx),$H3		# r3^n
   1254	vpmuludq	$T0,$H4,$H4		# h0*r2
   1255	vpaddq		$H4,$D2,$D2		# d2 += h0*r2
   1256	vpmuludq	$T4,$H2,$H1		# h4*s2
   1257	vpaddq		$H1,$D1,$D1		# d1 += h4*s2
   1258	 vpshufd	\$0x10,`16*6-64`($ctx),$H4		# s3^n
   1259	vpmuludq	$T3,$H2,$H2		# h3*s2
   1260	vpaddq		$H2,$D0,$D0		# d0 += h3*s2
   1261
   1262	vpmuludq	$T1,$H3,$H0		# h1*r3
   1263	vpaddq		$H0,$D4,$D4		# d4 += h1*r3
   1264	vpmuludq	$T0,$H3,$H3		# h0*r3
   1265	vpaddq		$H3,$D3,$D3		# d3 += h0*r3
   1266	 vpshufd	\$0x10,`16*7-64`($ctx),$H2		# r4^n
   1267	vpmuludq	$T4,$H4,$H1		# h4*s3
   1268	vpaddq		$H1,$D2,$D2		# d2 += h4*s3
   1269	 vpshufd	\$0x10,`16*8-64`($ctx),$H3		# s4^n
   1270	vpmuludq	$T3,$H4,$H0		# h3*s3
   1271	vpaddq		$H0,$D1,$D1		# d1 += h3*s3
   1272	vpmuludq	$T2,$H4,$H4		# h2*s3
   1273	vpaddq		$H4,$D0,$D0		# d0 += h2*s3
   1274
   1275	vpmuludq	$T0,$H2,$H2		# h0*r4
   1276	vpaddq		$H2,$D4,$D4		# h4 = d4 + h0*r4
   1277	vpmuludq	$T4,$H3,$H1		# h4*s4
   1278	vpaddq		$H1,$D3,$D3		# h3 = d3 + h4*s4
   1279	vpmuludq	$T3,$H3,$H0		# h3*s4
   1280	vpaddq		$H0,$D2,$D2		# h2 = d2 + h3*s4
   1281	vpmuludq	$T2,$H3,$H1		# h2*s4
   1282	vpaddq		$H1,$D1,$D1		# h1 = d1 + h2*s4
   1283	vpmuludq	$T1,$H3,$H3		# h1*s4
   1284	vpaddq		$H3,$D0,$D0		# h0 = d0 + h1*s4
   1285
   1286	jz		.Lshort_tail_avx
   1287
   1288	vmovdqu		16*0($inp),$H0		# load input
   1289	vmovdqu		16*1($inp),$H1
   1290
   1291	vpsrldq		\$6,$H0,$H2		# splat input
   1292	vpsrldq		\$6,$H1,$H3
   1293	vpunpckhqdq	$H1,$H0,$H4		# 4
   1294	vpunpcklqdq	$H1,$H0,$H0		# 0:1
   1295	vpunpcklqdq	$H3,$H2,$H3		# 2:3
   1296
   1297	vpsrlq		\$40,$H4,$H4		# 4
   1298	vpsrlq		\$26,$H0,$H1
   1299	vpand		$MASK,$H0,$H0		# 0
   1300	vpsrlq		\$4,$H3,$H2
   1301	vpand		$MASK,$H1,$H1		# 1
   1302	vpsrlq		\$30,$H3,$H3
   1303	vpand		$MASK,$H2,$H2		# 2
   1304	vpand		$MASK,$H3,$H3		# 3
   1305	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
   1306
   1307	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
   1308	vpaddq		0x00(%r11),$H0,$H0
   1309	vpaddq		0x10(%r11),$H1,$H1
   1310	vpaddq		0x20(%r11),$H2,$H2
   1311	vpaddq		0x30(%r11),$H3,$H3
   1312	vpaddq		0x40(%r11),$H4,$H4
   1313
   1314	################################################################
   1315	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
   1316
   1317	vpmuludq	$H0,$T4,$T0		# h0*r0
   1318	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
   1319	vpmuludq	$H1,$T4,$T1		# h1*r0
   1320	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
   1321	vpmuludq	$H2,$T4,$T0		# h2*r0
   1322	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
   1323	 vpshufd	\$0x32,`16*1-64`($ctx),$T2		# r1^n
   1324	vpmuludq	$H3,$T4,$T1		# h3*r0
   1325	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
   1326	vpmuludq	$H4,$T4,$T4		# h4*r0
   1327	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
   1328
   1329	vpmuludq	$H3,$T2,$T0		# h3*r1
   1330	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
   1331	 vpshufd	\$0x32,`16*2-64`($ctx),$T3		# s1
   1332	vpmuludq	$H2,$T2,$T1		# h2*r1
   1333	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
   1334	 vpshufd	\$0x32,`16*3-64`($ctx),$T4		# r2
   1335	vpmuludq	$H1,$T2,$T0		# h1*r1
   1336	vpaddq		$T0,$D2,$D2		# d2 += h1*r1
   1337	vpmuludq	$H0,$T2,$T2		# h0*r1
   1338	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
   1339	vpmuludq	$H4,$T3,$T3		# h4*s1
   1340	vpaddq		$T3,$D0,$D0		# d0 += h4*s1
   1341
   1342	 vpshufd	\$0x32,`16*4-64`($ctx),$T2		# s2
   1343	vpmuludq	$H2,$T4,$T1		# h2*r2
   1344	vpaddq		$T1,$D4,$D4		# d4 += h2*r2
   1345	vpmuludq	$H1,$T4,$T0		# h1*r2
   1346	vpaddq		$T0,$D3,$D3		# d3 += h1*r2
   1347	 vpshufd	\$0x32,`16*5-64`($ctx),$T3		# r3
   1348	vpmuludq	$H0,$T4,$T4		# h0*r2
   1349	vpaddq		$T4,$D2,$D2		# d2 += h0*r2
   1350	vpmuludq	$H4,$T2,$T1		# h4*s2
   1351	vpaddq		$T1,$D1,$D1		# d1 += h4*s2
   1352	 vpshufd	\$0x32,`16*6-64`($ctx),$T4		# s3
   1353	vpmuludq	$H3,$T2,$T2		# h3*s2
   1354	vpaddq		$T2,$D0,$D0		# d0 += h3*s2
   1355
   1356	vpmuludq	$H1,$T3,$T0		# h1*r3
   1357	vpaddq		$T0,$D4,$D4		# d4 += h1*r3
   1358	vpmuludq	$H0,$T3,$T3		# h0*r3
   1359	vpaddq		$T3,$D3,$D3		# d3 += h0*r3
   1360	 vpshufd	\$0x32,`16*7-64`($ctx),$T2		# r4
   1361	vpmuludq	$H4,$T4,$T1		# h4*s3
   1362	vpaddq		$T1,$D2,$D2		# d2 += h4*s3
   1363	 vpshufd	\$0x32,`16*8-64`($ctx),$T3		# s4
   1364	vpmuludq	$H3,$T4,$T0		# h3*s3
   1365	vpaddq		$T0,$D1,$D1		# d1 += h3*s3
   1366	vpmuludq	$H2,$T4,$T4		# h2*s3
   1367	vpaddq		$T4,$D0,$D0		# d0 += h2*s3
   1368
   1369	vpmuludq	$H0,$T2,$T2		# h0*r4
   1370	vpaddq		$T2,$D4,$D4		# d4 += h0*r4
   1371	vpmuludq	$H4,$T3,$T1		# h4*s4
   1372	vpaddq		$T1,$D3,$D3		# d3 += h4*s4
   1373	vpmuludq	$H3,$T3,$T0		# h3*s4
   1374	vpaddq		$T0,$D2,$D2		# d2 += h3*s4
   1375	vpmuludq	$H2,$T3,$T1		# h2*s4
   1376	vpaddq		$T1,$D1,$D1		# d1 += h2*s4
   1377	vpmuludq	$H1,$T3,$T3		# h1*s4
   1378	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
   1379
   1380.Lshort_tail_avx:
   1381	################################################################
   1382	# horizontal addition
   1383
   1384	vpsrldq		\$8,$D4,$T4
   1385	vpsrldq		\$8,$D3,$T3
   1386	vpsrldq		\$8,$D1,$T1
   1387	vpsrldq		\$8,$D0,$T0
   1388	vpsrldq		\$8,$D2,$T2
   1389	vpaddq		$T3,$D3,$D3
   1390	vpaddq		$T4,$D4,$D4
   1391	vpaddq		$T0,$D0,$D0
   1392	vpaddq		$T1,$D1,$D1
   1393	vpaddq		$T2,$D2,$D2
   1394
   1395	################################################################
   1396	# lazy reduction
   1397
   1398	vpsrlq		\$26,$D3,$H3
   1399	vpand		$MASK,$D3,$D3
   1400	vpaddq		$H3,$D4,$D4		# h3 -> h4
   1401
   1402	vpsrlq		\$26,$D0,$H0
   1403	vpand		$MASK,$D0,$D0
   1404	vpaddq		$H0,$D1,$D1		# h0 -> h1
   1405
   1406	vpsrlq		\$26,$D4,$H4
   1407	vpand		$MASK,$D4,$D4
   1408
   1409	vpsrlq		\$26,$D1,$H1
   1410	vpand		$MASK,$D1,$D1
   1411	vpaddq		$H1,$D2,$D2		# h1 -> h2
   1412
   1413	vpaddq		$H4,$D0,$D0
   1414	vpsllq		\$2,$H4,$H4
   1415	vpaddq		$H4,$D0,$D0		# h4 -> h0
   1416
   1417	vpsrlq		\$26,$D2,$H2
   1418	vpand		$MASK,$D2,$D2
   1419	vpaddq		$H2,$D3,$D3		# h2 -> h3
   1420
   1421	vpsrlq		\$26,$D0,$H0
   1422	vpand		$MASK,$D0,$D0
   1423	vpaddq		$H0,$D1,$D1		# h0 -> h1
   1424
   1425	vpsrlq		\$26,$D3,$H3
   1426	vpand		$MASK,$D3,$D3
   1427	vpaddq		$H3,$D4,$D4		# h3 -> h4
   1428
   1429	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
   1430	vmovd		$D1,`4*1-48-64`($ctx)
   1431	vmovd		$D2,`4*2-48-64`($ctx)
   1432	vmovd		$D3,`4*3-48-64`($ctx)
   1433	vmovd		$D4,`4*4-48-64`($ctx)
   1434___
   1435$code.=<<___	if ($win64);
   1436	vmovdqa		0x50(%r11),%xmm6
   1437	vmovdqa		0x60(%r11),%xmm7
   1438	vmovdqa		0x70(%r11),%xmm8
   1439	vmovdqa		0x80(%r11),%xmm9
   1440	vmovdqa		0x90(%r11),%xmm10
   1441	vmovdqa		0xa0(%r11),%xmm11
   1442	vmovdqa		0xb0(%r11),%xmm12
   1443	vmovdqa		0xc0(%r11),%xmm13
   1444	vmovdqa		0xd0(%r11),%xmm14
   1445	vmovdqa		0xe0(%r11),%xmm15
   1446	lea		0xf8(%r11),%rsp
   1447.Ldo_avx_epilogue:
   1448___
   1449$code.=<<___	if (!$win64);
   1450	lea		-8(%r10),%rsp
   1451.cfi_def_cfa_register	%rsp
   1452___
   1453$code.=<<___;
   1454	vzeroupper
   1455	RET
   1456.cfi_endproc
   1457___
   1458&end_function("poly1305_blocks_avx");
   1459
   1460&declare_function("poly1305_emit_avx", 32, 3);
   1461$code.=<<___;
   1462	cmpl	\$0,20($ctx)	# is_base2_26?
   1463	je	.Lemit
   1464
   1465	mov	0($ctx),%eax	# load hash value base 2^26
   1466	mov	4($ctx),%ecx
   1467	mov	8($ctx),%r8d
   1468	mov	12($ctx),%r11d
   1469	mov	16($ctx),%r10d
   1470
   1471	shl	\$26,%rcx	# base 2^26 -> base 2^64
   1472	mov	%r8,%r9
   1473	shl	\$52,%r8
   1474	add	%rcx,%rax
   1475	shr	\$12,%r9
   1476	add	%rax,%r8	# h0
   1477	adc	\$0,%r9
   1478
   1479	shl	\$14,%r11
   1480	mov	%r10,%rax
   1481	shr	\$24,%r10
   1482	add	%r11,%r9
   1483	shl	\$40,%rax
   1484	add	%rax,%r9	# h1
   1485	adc	\$0,%r10	# h2
   1486
   1487	mov	%r10,%rax	# could be partially reduced, so reduce
   1488	mov	%r10,%rcx
   1489	and	\$3,%r10
   1490	shr	\$2,%rax
   1491	and	\$-4,%rcx
   1492	add	%rcx,%rax
   1493	add	%rax,%r8
   1494	adc	\$0,%r9
   1495	adc	\$0,%r10
   1496
   1497	mov	%r8,%rax
   1498	add	\$5,%r8		# compare to modulus
   1499	mov	%r9,%rcx
   1500	adc	\$0,%r9
   1501	adc	\$0,%r10
   1502	shr	\$2,%r10	# did 130-bit value overflow?
   1503	cmovnz	%r8,%rax
   1504	cmovnz	%r9,%rcx
   1505
   1506	add	0($nonce),%rax	# accumulate nonce
   1507	adc	8($nonce),%rcx
   1508	mov	%rax,0($mac)	# write result
   1509	mov	%rcx,8($mac)
   1510
   1511	RET
   1512___
   1513&end_function("poly1305_emit_avx");
   1514
   1515if ($avx>1) {
   1516
   1517my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
   1518    map("%ymm$_",(0..15));
   1519my $S4=$MASK;
   1520
   1521sub poly1305_blocks_avxN {
   1522	my ($avx512) = @_;
   1523	my $suffix = $avx512 ? "_avx512" : "";
   1524$code.=<<___;
   1525.cfi_startproc
   1526	mov	20($ctx),%r8d		# is_base2_26
   1527	cmp	\$128,$len
   1528	jae	.Lblocks_avx2$suffix
   1529	test	%r8d,%r8d
   1530	jz	.Lblocks
   1531
   1532.Lblocks_avx2$suffix:
   1533	and	\$-16,$len
   1534	jz	.Lno_data_avx2$suffix
   1535
   1536	vzeroupper
   1537
   1538	test	%r8d,%r8d
   1539	jz	.Lbase2_64_avx2$suffix
   1540
   1541	test	\$63,$len
   1542	jz	.Leven_avx2$suffix
   1543
   1544	push	%rbp
   1545.cfi_push	%rbp
   1546	mov 	%rsp,%rbp
   1547	push	%rbx
   1548.cfi_push	%rbx
   1549	push	%r12
   1550.cfi_push	%r12
   1551	push	%r13
   1552.cfi_push	%r13
   1553	push	%r14
   1554.cfi_push	%r14
   1555	push	%r15
   1556.cfi_push	%r15
   1557.Lblocks_avx2_body$suffix:
   1558
   1559	mov	$len,%r15		# reassign $len
   1560
   1561	mov	0($ctx),$d1		# load hash value
   1562	mov	8($ctx),$d2
   1563	mov	16($ctx),$h2#d
   1564
   1565	mov	24($ctx),$r0		# load r
   1566	mov	32($ctx),$s1
   1567
   1568	################################# base 2^26 -> base 2^64
   1569	mov	$d1#d,$h0#d
   1570	and	\$`-1*(1<<31)`,$d1
   1571	mov	$d2,$r1			# borrow $r1
   1572	mov	$d2#d,$h1#d
   1573	and	\$`-1*(1<<31)`,$d2
   1574
   1575	shr	\$6,$d1
   1576	shl	\$52,$r1
   1577	add	$d1,$h0
   1578	shr	\$12,$h1
   1579	shr	\$18,$d2
   1580	add	$r1,$h0
   1581	adc	$d2,$h1
   1582
   1583	mov	$h2,$d1
   1584	shl	\$40,$d1
   1585	shr	\$24,$h2
   1586	add	$d1,$h1
   1587	adc	\$0,$h2			# can be partially reduced...
   1588
   1589	mov	\$-4,$d2		# ... so reduce
   1590	mov	$h2,$d1
   1591	and	$h2,$d2
   1592	shr	\$2,$d1
   1593	and	\$3,$h2
   1594	add	$d2,$d1			# =*5
   1595	add	$d1,$h0
   1596	adc	\$0,$h1
   1597	adc	\$0,$h2
   1598
   1599	mov	$s1,$r1
   1600	mov	$s1,%rax
   1601	shr	\$2,$s1
   1602	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
   1603
   1604.Lbase2_26_pre_avx2$suffix:
   1605	add	0($inp),$h0		# accumulate input
   1606	adc	8($inp),$h1
   1607	lea	16($inp),$inp
   1608	adc	$padbit,$h2
   1609	sub	\$16,%r15
   1610
   1611	call	__poly1305_block
   1612	mov	$r1,%rax
   1613
   1614	test	\$63,%r15
   1615	jnz	.Lbase2_26_pre_avx2$suffix
   1616
   1617	test	$padbit,$padbit		# if $padbit is zero,
   1618	jz	.Lstore_base2_64_avx2$suffix	# store hash in base 2^64 format
   1619
   1620	################################# base 2^64 -> base 2^26
   1621	mov	$h0,%rax
   1622	mov	$h0,%rdx
   1623	shr	\$52,$h0
   1624	mov	$h1,$r0
   1625	mov	$h1,$r1
   1626	shr	\$26,%rdx
   1627	and	\$0x3ffffff,%rax	# h[0]
   1628	shl	\$12,$r0
   1629	and	\$0x3ffffff,%rdx	# h[1]
   1630	shr	\$14,$h1
   1631	or	$r0,$h0
   1632	shl	\$24,$h2
   1633	and	\$0x3ffffff,$h0		# h[2]
   1634	shr	\$40,$r1
   1635	and	\$0x3ffffff,$h1		# h[3]
   1636	or	$r1,$h2			# h[4]
   1637
   1638	test	%r15,%r15
   1639	jz	.Lstore_base2_26_avx2$suffix
   1640
   1641	vmovd	%rax#d,%x#$H0
   1642	vmovd	%rdx#d,%x#$H1
   1643	vmovd	$h0#d,%x#$H2
   1644	vmovd	$h1#d,%x#$H3
   1645	vmovd	$h2#d,%x#$H4
   1646	jmp	.Lproceed_avx2$suffix
   1647
   1648.align	32
   1649.Lstore_base2_64_avx2$suffix:
   1650	mov	$h0,0($ctx)
   1651	mov	$h1,8($ctx)
   1652	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
   1653	jmp	.Ldone_avx2$suffix
   1654
   1655.align	16
   1656.Lstore_base2_26_avx2$suffix:
   1657	mov	%rax#d,0($ctx)		# store hash value base 2^26
   1658	mov	%rdx#d,4($ctx)
   1659	mov	$h0#d,8($ctx)
   1660	mov	$h1#d,12($ctx)
   1661	mov	$h2#d,16($ctx)
   1662.align	16
   1663.Ldone_avx2$suffix:
   1664	pop 		%r15
   1665.cfi_restore	%r15
   1666	pop 		%r14
   1667.cfi_restore	%r14
   1668	pop 		%r13
   1669.cfi_restore	%r13
   1670	pop 		%r12
   1671.cfi_restore	%r12
   1672	pop 		%rbx
   1673.cfi_restore	%rbx
   1674	pop 		%rbp
   1675.cfi_restore 	%rbp
   1676.Lno_data_avx2$suffix:
   1677.Lblocks_avx2_epilogue$suffix:
   1678	RET
   1679.cfi_endproc
   1680
   1681.align	32
   1682.Lbase2_64_avx2$suffix:
   1683.cfi_startproc
   1684	push	%rbp
   1685.cfi_push	%rbp
   1686	mov 	%rsp,%rbp
   1687	push	%rbx
   1688.cfi_push	%rbx
   1689	push	%r12
   1690.cfi_push	%r12
   1691	push	%r13
   1692.cfi_push	%r13
   1693	push	%r14
   1694.cfi_push	%r14
   1695	push	%r15
   1696.cfi_push	%r15
   1697.Lbase2_64_avx2_body$suffix:
   1698
   1699	mov	$len,%r15		# reassign $len
   1700
   1701	mov	24($ctx),$r0		# load r
   1702	mov	32($ctx),$s1
   1703
   1704	mov	0($ctx),$h0		# load hash value
   1705	mov	8($ctx),$h1
   1706	mov	16($ctx),$h2#d
   1707
   1708	mov	$s1,$r1
   1709	mov	$s1,%rax
   1710	shr	\$2,$s1
   1711	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
   1712
   1713	test	\$63,$len
   1714	jz	.Linit_avx2$suffix
   1715
   1716.Lbase2_64_pre_avx2$suffix:
   1717	add	0($inp),$h0		# accumulate input
   1718	adc	8($inp),$h1
   1719	lea	16($inp),$inp
   1720	adc	$padbit,$h2
   1721	sub	\$16,%r15
   1722
   1723	call	__poly1305_block
   1724	mov	$r1,%rax
   1725
   1726	test	\$63,%r15
   1727	jnz	.Lbase2_64_pre_avx2$suffix
   1728
   1729.Linit_avx2$suffix:
   1730	################################# base 2^64 -> base 2^26
   1731	mov	$h0,%rax
   1732	mov	$h0,%rdx
   1733	shr	\$52,$h0
   1734	mov	$h1,$d1
   1735	mov	$h1,$d2
   1736	shr	\$26,%rdx
   1737	and	\$0x3ffffff,%rax	# h[0]
   1738	shl	\$12,$d1
   1739	and	\$0x3ffffff,%rdx	# h[1]
   1740	shr	\$14,$h1
   1741	or	$d1,$h0
   1742	shl	\$24,$h2
   1743	and	\$0x3ffffff,$h0		# h[2]
   1744	shr	\$40,$d2
   1745	and	\$0x3ffffff,$h1		# h[3]
   1746	or	$d2,$h2			# h[4]
   1747
   1748	vmovd	%rax#d,%x#$H0
   1749	vmovd	%rdx#d,%x#$H1
   1750	vmovd	$h0#d,%x#$H2
   1751	vmovd	$h1#d,%x#$H3
   1752	vmovd	$h2#d,%x#$H4
   1753	movl	\$1,20($ctx)		# set is_base2_26
   1754
   1755	call	__poly1305_init_avx
   1756
   1757.Lproceed_avx2$suffix:
   1758	mov	%r15,$len			# restore $len
   1759___
   1760$code.=<<___ if (!$kernel);
   1761	mov	OPENSSL_ia32cap_P+8(%rip),%r9d
   1762	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
   1763___
   1764$code.=<<___;
   1765	pop 		%r15
   1766.cfi_restore	%r15
   1767	pop 		%r14
   1768.cfi_restore	%r14
   1769	pop 		%r13
   1770.cfi_restore	%r13
   1771	pop 		%r12
   1772.cfi_restore	%r12
   1773	pop 		%rbx
   1774.cfi_restore	%rbx
   1775	pop 		%rbp
   1776.cfi_restore 	%rbp
   1777.Lbase2_64_avx2_epilogue$suffix:
   1778	jmp	.Ldo_avx2$suffix
   1779.cfi_endproc
   1780
   1781.align	32
   1782.Leven_avx2$suffix:
   1783.cfi_startproc
   1784___
   1785$code.=<<___ if (!$kernel);
   1786	mov		OPENSSL_ia32cap_P+8(%rip),%r9d
   1787___
   1788$code.=<<___;
   1789	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
   1790	vmovd		4*1($ctx),%x#$H1
   1791	vmovd		4*2($ctx),%x#$H2
   1792	vmovd		4*3($ctx),%x#$H3
   1793	vmovd		4*4($ctx),%x#$H4
   1794
   1795.Ldo_avx2$suffix:
   1796___
   1797$code.=<<___		if (!$kernel && $avx>2);
   1798	cmp		\$512,$len
   1799	jb		.Lskip_avx512
   1800	and		%r11d,%r9d
   1801	test		\$`1<<16`,%r9d		# check for AVX512F
   1802	jnz		.Lblocks_avx512
   1803.Lskip_avx512$suffix:
   1804___
   1805$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
   1806	cmp		\$512,$len
   1807	jae		.Lblocks_avx512
   1808___
   1809$code.=<<___	if (!$win64);
   1810	lea		8(%rsp),%r10
   1811.cfi_def_cfa_register	%r10
   1812	sub		\$0x128,%rsp
   1813___
   1814$code.=<<___	if ($win64);
   1815	lea		8(%rsp),%r10
   1816	sub		\$0x1c8,%rsp
   1817	vmovdqa		%xmm6,-0xb0(%r10)
   1818	vmovdqa		%xmm7,-0xa0(%r10)
   1819	vmovdqa		%xmm8,-0x90(%r10)
   1820	vmovdqa		%xmm9,-0x80(%r10)
   1821	vmovdqa		%xmm10,-0x70(%r10)
   1822	vmovdqa		%xmm11,-0x60(%r10)
   1823	vmovdqa		%xmm12,-0x50(%r10)
   1824	vmovdqa		%xmm13,-0x40(%r10)
   1825	vmovdqa		%xmm14,-0x30(%r10)
   1826	vmovdqa		%xmm15,-0x20(%r10)
   1827.Ldo_avx2_body$suffix:
   1828___
   1829$code.=<<___;
   1830	lea		.Lconst(%rip),%rcx
   1831	lea		48+64($ctx),$ctx	# size optimization
   1832	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
   1833
   1834	# expand and copy pre-calculated table to stack
   1835	vmovdqu		`16*0-64`($ctx),%x#$T2
   1836	and		\$-512,%rsp
   1837	vmovdqu		`16*1-64`($ctx),%x#$T3
   1838	vmovdqu		`16*2-64`($ctx),%x#$T4
   1839	vmovdqu		`16*3-64`($ctx),%x#$D0
   1840	vmovdqu		`16*4-64`($ctx),%x#$D1
   1841	vmovdqu		`16*5-64`($ctx),%x#$D2
   1842	lea		0x90(%rsp),%rax		# size optimization
   1843	vmovdqu		`16*6-64`($ctx),%x#$D3
   1844	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
   1845	vmovdqu		`16*7-64`($ctx),%x#$D4
   1846	vpermd		$T3,$T0,$T3
   1847	vmovdqu		`16*8-64`($ctx),%x#$MASK
   1848	vpermd		$T4,$T0,$T4
   1849	vmovdqa		$T2,0x00(%rsp)
   1850	vpermd		$D0,$T0,$D0
   1851	vmovdqa		$T3,0x20-0x90(%rax)
   1852	vpermd		$D1,$T0,$D1
   1853	vmovdqa		$T4,0x40-0x90(%rax)
   1854	vpermd		$D2,$T0,$D2
   1855	vmovdqa		$D0,0x60-0x90(%rax)
   1856	vpermd		$D3,$T0,$D3
   1857	vmovdqa		$D1,0x80-0x90(%rax)
   1858	vpermd		$D4,$T0,$D4
   1859	vmovdqa		$D2,0xa0-0x90(%rax)
   1860	vpermd		$MASK,$T0,$MASK
   1861	vmovdqa		$D3,0xc0-0x90(%rax)
   1862	vmovdqa		$D4,0xe0-0x90(%rax)
   1863	vmovdqa		$MASK,0x100-0x90(%rax)
   1864	vmovdqa		64(%rcx),$MASK		# .Lmask26
   1865
   1866	################################################################
   1867	# load input
   1868	vmovdqu		16*0($inp),%x#$T0
   1869	vmovdqu		16*1($inp),%x#$T1
   1870	vinserti128	\$1,16*2($inp),$T0,$T0
   1871	vinserti128	\$1,16*3($inp),$T1,$T1
   1872	lea		16*4($inp),$inp
   1873
   1874	vpsrldq		\$6,$T0,$T2		# splat input
   1875	vpsrldq		\$6,$T1,$T3
   1876	vpunpckhqdq	$T1,$T0,$T4		# 4
   1877	vpunpcklqdq	$T3,$T2,$T2		# 2:3
   1878	vpunpcklqdq	$T1,$T0,$T0		# 0:1
   1879
   1880	vpsrlq		\$30,$T2,$T3
   1881	vpsrlq		\$4,$T2,$T2
   1882	vpsrlq		\$26,$T0,$T1
   1883	vpsrlq		\$40,$T4,$T4		# 4
   1884	vpand		$MASK,$T2,$T2		# 2
   1885	vpand		$MASK,$T0,$T0		# 0
   1886	vpand		$MASK,$T1,$T1		# 1
   1887	vpand		$MASK,$T3,$T3		# 3
   1888	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
   1889
   1890	vpaddq		$H2,$T2,$H2		# accumulate input
   1891	sub		\$64,$len
   1892	jz		.Ltail_avx2$suffix
   1893	jmp		.Loop_avx2$suffix
   1894
   1895.align	32
   1896.Loop_avx2$suffix:
   1897	################################################################
   1898	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
   1899	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
   1900	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
   1901	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
   1902	#   \________/\__________/
   1903	################################################################
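	# A scalar sketch of this schedule (illustrative comment only; every
	# product is taken mod 2^130-5 and the running hash rides in lane 0):
	#
	#	acc[j] = acc[j]*r^4 + inp[i+j]	for j = 0..3, per iteration
	#	hash   = acc[0]*r^4 + acc[1]*r^3 + acc[2]*r^2 + acc[3]*r
	#
	# with the final weighting and fold performed by .Ltail_avx2 below.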
   1904	#vpaddq		$H2,$T2,$H2		# accumulate input
   1905	vpaddq		$H0,$T0,$H0
   1906	vmovdqa		`32*0`(%rsp),$T0	# r0^4
   1907	vpaddq		$H1,$T1,$H1
   1908	vmovdqa		`32*1`(%rsp),$T1	# r1^4
   1909	vpaddq		$H3,$T3,$H3
   1910	vmovdqa		`32*3`(%rsp),$T2	# r2^4
   1911	vpaddq		$H4,$T4,$H4
   1912	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
   1913	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4
   1914
   1915	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
   1916	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
   1917	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
   1918	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
   1919	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
   1920	#
   1921	# however, as h2 is "chronologically" the first one available, pull the
   1922	# corresponding operations up, so it's
   1923	#
   1924	# d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
   1925	# d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
   1926	# d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
   1927	# d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
   1928	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
   1929
   1930	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
   1931	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
   1932	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
   1933	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
   1934	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
   1935
   1936	vpmuludq	$H0,$T1,$T4		# h0*r1
   1937	vpmuludq	$H1,$T1,$H2		# h1*r1, borrow $H2 as temp
   1938	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
   1939	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
   1940	vpmuludq	$H3,$T1,$T4		# h3*r1
   1941	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
   1942	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
   1943	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
   1944	 vmovdqa	`32*4-0x90`(%rax),$T1	# s2
   1945
   1946	vpmuludq	$H0,$T0,$T4		# h0*r0
   1947	vpmuludq	$H1,$T0,$H2		# h1*r0
   1948	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
   1949	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
   1950	vpmuludq	$H3,$T0,$T4		# h3*r0
   1951	vpmuludq	$H4,$T0,$H2		# h4*r0
   1952	 vmovdqu	16*0($inp),%x#$T0	# load input
   1953	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
   1954	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
   1955	 vinserti128	\$1,16*2($inp),$T0,$T0
   1956
   1957	vpmuludq	$H3,$T1,$T4		# h3*s2
   1958	vpmuludq	$H4,$T1,$H2		# h4*s2
   1959	 vmovdqu	16*1($inp),%x#$T1
   1960	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
   1961	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
   1962	 vmovdqa	`32*5-0x90`(%rax),$H2	# r3
   1963	vpmuludq	$H1,$T2,$T4		# h1*r2
   1964	vpmuludq	$H0,$T2,$T2		# h0*r2
   1965	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
   1966	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
   1967	 vinserti128	\$1,16*3($inp),$T1,$T1
   1968	 lea		16*4($inp),$inp
   1969
   1970	vpmuludq	$H1,$H2,$T4		# h1*r3
   1971	vpmuludq	$H0,$H2,$H2		# h0*r3
   1972	 vpsrldq	\$6,$T0,$T2		# splat input
   1973	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
   1974	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
   1975	vpmuludq	$H3,$T3,$T4		# h3*s3
   1976	vpmuludq	$H4,$T3,$H2		# h4*s3
   1977	 vpsrldq	\$6,$T1,$T3
   1978	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
   1979	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
   1980	 vpunpckhqdq	$T1,$T0,$T4		# 4
   1981
   1982	vpmuludq	$H3,$S4,$H3		# h3*s4
   1983	vpmuludq	$H4,$S4,$H4		# h4*s4
   1984	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
   1985	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
   1986	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
   1987	 vpunpcklqdq	$T3,$T2,$T3		# 2:3
   1988	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
   1989	vpmuludq	$H1,$S4,$H0		# h1*s4
   1990	vmovdqa		64(%rcx),$MASK		# .Lmask26
   1991	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
   1992	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
   1993
   1994	################################################################
   1995	# lazy reduction (interleaved with tail of input splat)
   1996
   1997	vpsrlq		\$26,$H3,$D3
   1998	vpand		$MASK,$H3,$H3
   1999	vpaddq		$D3,$H4,$H4		# h3 -> h4
   2000
   2001	vpsrlq		\$26,$H0,$D0
   2002	vpand		$MASK,$H0,$H0
   2003	vpaddq		$D0,$D1,$H1		# h0 -> h1
   2004
   2005	vpsrlq		\$26,$H4,$D4
   2006	vpand		$MASK,$H4,$H4
   2007
   2008	 vpsrlq		\$4,$T3,$T2
   2009
   2010	vpsrlq		\$26,$H1,$D1
   2011	vpand		$MASK,$H1,$H1
   2012	vpaddq		$D1,$H2,$H2		# h1 -> h2
   2013
   2014	vpaddq		$D4,$H0,$H0
   2015	vpsllq		\$2,$D4,$D4
   2016	vpaddq		$D4,$H0,$H0		# h4 -> h0
   2017
   2018	 vpand		$MASK,$T2,$T2		# 2
   2019	 vpsrlq		\$26,$T0,$T1
   2020
   2021	vpsrlq		\$26,$H2,$D2
   2022	vpand		$MASK,$H2,$H2
   2023	vpaddq		$D2,$H3,$H3		# h2 -> h3
   2024
   2025	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
   2026	 vpsrlq		\$30,$T3,$T3
   2027
   2028	vpsrlq		\$26,$H0,$D0
   2029	vpand		$MASK,$H0,$H0
   2030	vpaddq		$D0,$H1,$H1		# h0 -> h1
   2031
   2032	 vpsrlq		\$40,$T4,$T4		# 4
   2033
   2034	vpsrlq		\$26,$H3,$D3
   2035	vpand		$MASK,$H3,$H3
   2036	vpaddq		$D3,$H4,$H4		# h3 -> h4
   2037
   2038	 vpand		$MASK,$T0,$T0		# 0
   2039	 vpand		$MASK,$T1,$T1		# 1
   2040	 vpand		$MASK,$T3,$T3		# 3
   2041	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
   2042
   2043	sub		\$64,$len
   2044	jnz		.Loop_avx2$suffix
   2045
   2046	.byte		0x66,0x90
   2047.Ltail_avx2$suffix:
   2048	################################################################
   2049	# while the above multiplications were by r^4 in all lanes, in the last
   2050	# iteration we multiply the least significant lane by r^4 and the most
   2051	# significant one by r, so this is a copy of the above except that
   2052	# references to the precomputed table are displaced by 4 bytes...
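	# Illustrative note on the 4-byte displacement: each table row on the
	# stack stores its dwords (high to low) as r^1,r^4,r^2,r^4,r^3,r^4,r^4,r^4,
	# so the aligned loads in .Loop_avx2 feed vpmuludq with r^4 in the low
	# half of every 64-bit lane, while the +4 loads below shift everything
	# down by one dword and feed it r^4,r^3,r^2,r^1 across lanes 0..3 instead.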
   2053
   2054	#vpaddq		$H2,$T2,$H2		# accumulate input
   2055	vpaddq		$H0,$T0,$H0
   2056	vmovdqu		`32*0+4`(%rsp),$T0	# r0^4
   2057	vpaddq		$H1,$T1,$H1
   2058	vmovdqu		`32*1+4`(%rsp),$T1	# r1^4
   2059	vpaddq		$H3,$T3,$H3
   2060	vmovdqu		`32*3+4`(%rsp),$T2	# r2^4
   2061	vpaddq		$H4,$T4,$H4
   2062	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3^4
   2063	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4^4
   2064
   2065	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
   2066	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
   2067	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
   2068	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
   2069	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
   2070
   2071	vpmuludq	$H0,$T1,$T4		# h0*r1
   2072	vpmuludq	$H1,$T1,$H2		# h1*r1
   2073	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
   2074	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
   2075	vpmuludq	$H3,$T1,$T4		# h3*r1
   2076	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
   2077	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
   2078	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
   2079
   2080	vpmuludq	$H0,$T0,$T4		# h0*r0
   2081	vpmuludq	$H1,$T0,$H2		# h1*r0
   2082	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
   2083	 vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
   2084	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
   2085	vpmuludq	$H3,$T0,$T4		# h3*r0
   2086	vpmuludq	$H4,$T0,$H2		# h4*r0
   2087	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
   2088	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
   2089
   2090	vpmuludq	$H3,$T1,$T4		# h3*s2
   2091	vpmuludq	$H4,$T1,$H2		# h4*s2
   2092	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
   2093	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
   2094	 vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
   2095	vpmuludq	$H1,$T2,$T4		# h1*r2
   2096	vpmuludq	$H0,$T2,$T2		# h0*r2
   2097	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
   2098	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
   2099
   2100	vpmuludq	$H1,$H2,$T4		# h1*r3
   2101	vpmuludq	$H0,$H2,$H2		# h0*r3
   2102	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
   2103	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
   2104	vpmuludq	$H3,$T3,$T4		# h3*s3
   2105	vpmuludq	$H4,$T3,$H2		# h4*s3
   2106	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
   2107	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
   2108
   2109	vpmuludq	$H3,$S4,$H3		# h3*s4
   2110	vpmuludq	$H4,$S4,$H4		# h4*s4
   2111	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
   2112	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
   2113	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
   2114	vpmuludq	$H1,$S4,$H0		# h1*s4
   2115	vmovdqa		64(%rcx),$MASK		# .Lmask26
   2116	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
   2117	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
   2118
   2119	################################################################
   2120	# horizontal addition
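	# Illustrative note: for each register the two folds below compute
	#
	#	q0 += q1	(vpsrldq \$8 within each 128-bit half)
	#	q0 += q2+q3	(vpermq \$0x2 across the halves)
	#
	# so the low qword ends up holding the sum of all four 64-bit lanes.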
   2121
   2122	vpsrldq		\$8,$D1,$T1
   2123	vpsrldq		\$8,$H2,$T2
   2124	vpsrldq		\$8,$H3,$T3
   2125	vpsrldq		\$8,$H4,$T4
   2126	vpsrldq		\$8,$H0,$T0
   2127	vpaddq		$T1,$D1,$D1
   2128	vpaddq		$T2,$H2,$H2
   2129	vpaddq		$T3,$H3,$H3
   2130	vpaddq		$T4,$H4,$H4
   2131	vpaddq		$T0,$H0,$H0
   2132
   2133	vpermq		\$0x2,$H3,$T3
   2134	vpermq		\$0x2,$H4,$T4
   2135	vpermq		\$0x2,$H0,$T0
   2136	vpermq		\$0x2,$D1,$T1
   2137	vpermq		\$0x2,$H2,$T2
   2138	vpaddq		$T3,$H3,$H3
   2139	vpaddq		$T4,$H4,$H4
   2140	vpaddq		$T0,$H0,$H0
   2141	vpaddq		$T1,$D1,$D1
   2142	vpaddq		$T2,$H2,$H2
   2143
   2144	################################################################
   2145	# lazy reduction
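	# The carry chain below in scalar form (illustrative comment only; hN are
	# the 26-bit limbs of h, and a carry out of h4 wraps around multiplied
	# by 5 because 2^130 = 5 mod 2^130-5):
	#
	#	c = h3>>26; h3 &= 0x3ffffff; h4 += c;
	#	c = h0>>26; h0 &= 0x3ffffff; h1 += c;
	#	d = h4>>26; h4 &= 0x3ffffff;
	#	c = h1>>26; h1 &= 0x3ffffff; h2 += c;
	#	h0 += d*5;			# computed as d + (d<<2), the vpsllq \$2 pair
	#	c = h2>>26; h2 &= 0x3ffffff; h3 += c;
	#	c = h0>>26; h0 &= 0x3ffffff; h1 += c;
	#	c = h3>>26; h3 &= 0x3ffffff; h4 += c;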
   2146
   2147	vpsrlq		\$26,$H3,$D3
   2148	vpand		$MASK,$H3,$H3
   2149	vpaddq		$D3,$H4,$H4		# h3 -> h4
   2150
   2151	vpsrlq		\$26,$H0,$D0
   2152	vpand		$MASK,$H0,$H0
   2153	vpaddq		$D0,$D1,$H1		# h0 -> h1
   2154
   2155	vpsrlq		\$26,$H4,$D4
   2156	vpand		$MASK,$H4,$H4
   2157
   2158	vpsrlq		\$26,$H1,$D1
   2159	vpand		$MASK,$H1,$H1
   2160	vpaddq		$D1,$H2,$H2		# h1 -> h2
   2161
   2162	vpaddq		$D4,$H0,$H0
   2163	vpsllq		\$2,$D4,$D4
   2164	vpaddq		$D4,$H0,$H0		# h4 -> h0
   2165
   2166	vpsrlq		\$26,$H2,$D2
   2167	vpand		$MASK,$H2,$H2
   2168	vpaddq		$D2,$H3,$H3		# h2 -> h3
   2169
   2170	vpsrlq		\$26,$H0,$D0
   2171	vpand		$MASK,$H0,$H0
   2172	vpaddq		$D0,$H1,$H1		# h0 -> h1
   2173
   2174	vpsrlq		\$26,$H3,$D3
   2175	vpand		$MASK,$H3,$H3
   2176	vpaddq		$D3,$H4,$H4		# h3 -> h4
   2177
   2178	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
   2179	vmovd		%x#$H1,`4*1-48-64`($ctx)
   2180	vmovd		%x#$H2,`4*2-48-64`($ctx)
   2181	vmovd		%x#$H3,`4*3-48-64`($ctx)
   2182	vmovd		%x#$H4,`4*4-48-64`($ctx)
   2183___
   2184$code.=<<___	if ($win64);
   2185	vmovdqa		-0xb0(%r10),%xmm6
   2186	vmovdqa		-0xa0(%r10),%xmm7
   2187	vmovdqa		-0x90(%r10),%xmm8
   2188	vmovdqa		-0x80(%r10),%xmm9
   2189	vmovdqa		-0x70(%r10),%xmm10
   2190	vmovdqa		-0x60(%r10),%xmm11
   2191	vmovdqa		-0x50(%r10),%xmm12
   2192	vmovdqa		-0x40(%r10),%xmm13
   2193	vmovdqa		-0x30(%r10),%xmm14
   2194	vmovdqa		-0x20(%r10),%xmm15
   2195	lea		-8(%r10),%rsp
   2196.Ldo_avx2_epilogue$suffix:
   2197___
   2198$code.=<<___	if (!$win64);
   2199	lea		-8(%r10),%rsp
   2200.cfi_def_cfa_register	%rsp
   2201___
   2202$code.=<<___;
   2203	vzeroupper
   2204	RET
   2205.cfi_endproc
   2206___
   2207if($avx > 2 && $avx512) {
   2208my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
   2209my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
   2210my $PADBIT="%zmm30";
   2211
   2212map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));		# switch to %zmm domain
   2213map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
   2214map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
   2215map(s/%y/%z/,($MASK));
   2216
   2217$code.=<<___;
   2218.cfi_startproc
   2219.Lblocks_avx512:
   2220	mov		\$15,%eax
   2221	kmovw		%eax,%k2
   2222___
   2223$code.=<<___	if (!$win64);
   2224	lea		8(%rsp),%r10
   2225.cfi_def_cfa_register	%r10
   2226	sub		\$0x128,%rsp
   2227___
   2228$code.=<<___	if ($win64);
   2229	lea		8(%rsp),%r10
   2230	sub		\$0x1c8,%rsp
   2231	vmovdqa		%xmm6,-0xb0(%r10)
   2232	vmovdqa		%xmm7,-0xa0(%r10)
   2233	vmovdqa		%xmm8,-0x90(%r10)
   2234	vmovdqa		%xmm9,-0x80(%r10)
   2235	vmovdqa		%xmm10,-0x70(%r10)
   2236	vmovdqa		%xmm11,-0x60(%r10)
   2237	vmovdqa		%xmm12,-0x50(%r10)
   2238	vmovdqa		%xmm13,-0x40(%r10)
   2239	vmovdqa		%xmm14,-0x30(%r10)
   2240	vmovdqa		%xmm15,-0x20(%r10)
   2241.Ldo_avx512_body:
   2242___
   2243$code.=<<___;
   2244	lea		.Lconst(%rip),%rcx
   2245	lea		48+64($ctx),$ctx	# size optimization
   2246	vmovdqa		96(%rcx),%y#$T2		# .Lpermd_avx2
   2247
   2248	# expand pre-calculated table
   2249	vmovdqu		`16*0-64`($ctx),%x#$D0	# will become expanded ${R0}
   2250	and		\$-512,%rsp
   2251	vmovdqu		`16*1-64`($ctx),%x#$D1	# will become ... ${R1}
   2252	mov		\$0x20,%rax
   2253	vmovdqu		`16*2-64`($ctx),%x#$T0	# ... ${S1}
   2254	vmovdqu		`16*3-64`($ctx),%x#$D2	# ... ${R2}
   2255	vmovdqu		`16*4-64`($ctx),%x#$T1	# ... ${S2}
   2256	vmovdqu		`16*5-64`($ctx),%x#$D3	# ... ${R3}
   2257	vmovdqu		`16*6-64`($ctx),%x#$T3	# ... ${S3}
   2258	vmovdqu		`16*7-64`($ctx),%x#$D4	# ... ${R4}
   2259	vmovdqu		`16*8-64`($ctx),%x#$T4	# ... ${S4}
   2260	vpermd		$D0,$T2,$R0		# 00003412 -> 14243444
   2261	vpbroadcastq	64(%rcx),$MASK		# .Lmask26
   2262	vpermd		$D1,$T2,$R1
   2263	vpermd		$T0,$T2,$S1
   2264	vpermd		$D2,$T2,$R2
   2265	vmovdqa64	$R0,0x00(%rsp){%k2}	# save in case $len%128 != 0
   2266	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
   2267	vpermd		$T1,$T2,$S2
   2268	vmovdqu64	$R1,0x00(%rsp,%rax){%k2}
   2269	 vpsrlq		\$32,$R1,$T1
   2270	vpermd		$D3,$T2,$R3
   2271	vmovdqa64	$S1,0x40(%rsp){%k2}
   2272	vpermd		$T3,$T2,$S3
   2273	vpermd		$D4,$T2,$R4
   2274	vmovdqu64	$R2,0x40(%rsp,%rax){%k2}
   2275	vpermd		$T4,$T2,$S4
   2276	vmovdqa64	$S2,0x80(%rsp){%k2}
   2277	vmovdqu64	$R3,0x80(%rsp,%rax){%k2}
   2278	vmovdqa64	$S3,0xc0(%rsp){%k2}
   2279	vmovdqu64	$R4,0xc0(%rsp,%rax){%k2}
   2280	vmovdqa64	$S4,0x100(%rsp){%k2}
   2281
   2282	################################################################
   2283	# calculate 5th through 8th powers of the key
   2284	#
   2285	# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
   2286	# d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
   2287	# d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
   2288	# d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
   2289	# d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
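	# Illustrative note: the vpsrlq \$32 extractions (above and interleaved
	# below) leave each 64-bit lane of $T0-$T4 holding the limbs of one plain
	# power (lanes 0..3 carry r^4,r^3,r^2,r^1), while the low halves of
	# $R0-$S4 still carry r^4 in every lane, so the lane-wise products are
	#
	#	lane 0: r^4*r^4 = r^8	lane 1: r^3*r^4 = r^7
	#	lane 2: r^2*r^4 = r^6	lane 3: r^1*r^4 = r^5
	#
	# i.e. the "05060708" set referred to further down.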
   2290
   2291	vpmuludq	$T0,$R0,$D0		# d0 = r0'*r0
   2292	vpmuludq	$T0,$R1,$D1		# d1 = r0'*r1
   2293	vpmuludq	$T0,$R2,$D2		# d2 = r0'*r2
   2294	vpmuludq	$T0,$R3,$D3		# d3 = r0'*r3
   2295	vpmuludq	$T0,$R4,$D4		# d4 = r0'*r4
   2296	 vpsrlq		\$32,$R2,$T2
   2297
   2298	vpmuludq	$T1,$S4,$M0
   2299	vpmuludq	$T1,$R0,$M1
   2300	vpmuludq	$T1,$R1,$M2
   2301	vpmuludq	$T1,$R2,$M3
   2302	vpmuludq	$T1,$R3,$M4
   2303	 vpsrlq		\$32,$R3,$T3
   2304	vpaddq		$M0,$D0,$D0		# d0 += r1'*5*r4
   2305	vpaddq		$M1,$D1,$D1		# d1 += r1'*r0
   2306	vpaddq		$M2,$D2,$D2		# d2 += r1'*r1
   2307	vpaddq		$M3,$D3,$D3		# d3 += r1'*r2
   2308	vpaddq		$M4,$D4,$D4		# d4 += r1'*r3
   2309
   2310	vpmuludq	$T2,$S3,$M0
   2311	vpmuludq	$T2,$S4,$M1
   2312	vpmuludq	$T2,$R1,$M3
   2313	vpmuludq	$T2,$R2,$M4
   2314	vpmuludq	$T2,$R0,$M2
   2315	 vpsrlq		\$32,$R4,$T4
   2316	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r3
   2317	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r4
   2318	vpaddq		$M3,$D3,$D3		# d3 += r2'*r1
   2319	vpaddq		$M4,$D4,$D4		# d4 += r2'*r2
   2320	vpaddq		$M2,$D2,$D2		# d2 += r2'*r0
   2321
   2322	vpmuludq	$T3,$S2,$M0
   2323	vpmuludq	$T3,$R0,$M3
   2324	vpmuludq	$T3,$R1,$M4
   2325	vpmuludq	$T3,$S3,$M1
   2326	vpmuludq	$T3,$S4,$M2
   2327	vpaddq		$M0,$D0,$D0		# d0 += r3'*5*r2
   2328	vpaddq		$M3,$D3,$D3		# d3 += r3'*r0
   2329	vpaddq		$M4,$D4,$D4		# d4 += r3'*r1
   2330	vpaddq		$M1,$D1,$D1		# d1 += r3'*5*r3
   2331	vpaddq		$M2,$D2,$D2		# d2 += r3'*5*r4
   2332
   2333	vpmuludq	$T4,$S4,$M3
   2334	vpmuludq	$T4,$R0,$M4
   2335	vpmuludq	$T4,$S1,$M0
   2336	vpmuludq	$T4,$S2,$M1
   2337	vpmuludq	$T4,$S3,$M2
   2338	vpaddq		$M3,$D3,$D3		# d3 += r4'*5*r4
   2339	vpaddq		$M4,$D4,$D4		# d4 += r4'*r0
   2340	vpaddq		$M0,$D0,$D0		# d0 += r4'*5*r1
   2341	vpaddq		$M1,$D1,$D1		# d1 += r4'*5*r2
   2342	vpaddq		$M2,$D2,$D2		# d2 += r4'*5*r3
   2343
   2344	################################################################
   2345	# load input
   2346	vmovdqu64	16*0($inp),%z#$T3
   2347	vmovdqu64	16*4($inp),%z#$T4
   2348	lea		16*8($inp),$inp
   2349
   2350	################################################################
   2351	# lazy reduction
   2352
   2353	vpsrlq		\$26,$D3,$M3
   2354	vpandq		$MASK,$D3,$D3
   2355	vpaddq		$M3,$D4,$D4		# d3 -> d4
   2356
   2357	vpsrlq		\$26,$D0,$M0
   2358	vpandq		$MASK,$D0,$D0
   2359	vpaddq		$M0,$D1,$D1		# d0 -> d1
   2360
   2361	vpsrlq		\$26,$D4,$M4
   2362	vpandq		$MASK,$D4,$D4
   2363
   2364	vpsrlq		\$26,$D1,$M1
   2365	vpandq		$MASK,$D1,$D1
   2366	vpaddq		$M1,$D2,$D2		# d1 -> d2
   2367
   2368	vpaddq		$M4,$D0,$D0
   2369	vpsllq		\$2,$M4,$M4
   2370	vpaddq		$M4,$D0,$D0		# d4 -> d0
   2371
   2372	vpsrlq		\$26,$D2,$M2
   2373	vpandq		$MASK,$D2,$D2
   2374	vpaddq		$M2,$D3,$D3		# d2 -> d3
   2375
   2376	vpsrlq		\$26,$D0,$M0
   2377	vpandq		$MASK,$D0,$D0
   2378	vpaddq		$M0,$D1,$D1		# d0 -> d1
   2379
   2380	vpsrlq		\$26,$D3,$M3
   2381	vpandq		$MASK,$D3,$D3
   2382	vpaddq		$M3,$D4,$D4		# d3 -> d4
   2383
   2384	################################################################
   2385	# at this point we have 14243444 in $R0-$S4 and 05060708 in
   2386	# $D0-$D4, ...
   2387
   2388	vpunpcklqdq	$T4,$T3,$T0	# transpose input
   2389	vpunpckhqdq	$T4,$T3,$T4
   2390
   2391	# ... since input 64-bit lanes are ordered as 73625140, we could
   2392	# "vperm" it to 76543210 (here and in each loop iteration), *or*
   2393	# we could just flow along, hence the goal for $R0-$S4 is
   2394	# 1858286838784888 ...
   2395
   2396	vmovdqa32	128(%rcx),$M0		# .Lpermd_avx512:
   2397	mov		\$0x7777,%eax
   2398	kmovw		%eax,%k1
   2399
   2400	vpermd		$R0,$M0,$R0		# 14243444 -> 1---2---3---4---
   2401	vpermd		$R1,$M0,$R1
   2402	vpermd		$R2,$M0,$R2
   2403	vpermd		$R3,$M0,$R3
   2404	vpermd		$R4,$M0,$R4
   2405
   2406	vpermd		$D0,$M0,${R0}{%k1}	# 05060708 -> 1858286838784888
   2407	vpermd		$D1,$M0,${R1}{%k1}
   2408	vpermd		$D2,$M0,${R2}{%k1}
   2409	vpermd		$D3,$M0,${R3}{%k1}
   2410	vpermd		$D4,$M0,${R4}{%k1}
   2411
   2412	vpslld		\$2,$R1,$S1		# *5
   2413	vpslld		\$2,$R2,$S2
   2414	vpslld		\$2,$R3,$S3
   2415	vpslld		\$2,$R4,$S4
   2416	vpaddd		$R1,$S1,$S1
   2417	vpaddd		$R2,$S2,$S2
   2418	vpaddd		$R3,$S3,$S3
   2419	vpaddd		$R4,$S4,$S4
   2420
   2421	vpbroadcastq	32(%rcx),$PADBIT	# .L129
   2422
   2423	vpsrlq		\$52,$T0,$T2		# splat input
   2424	vpsllq		\$12,$T4,$T3
   2425	vporq		$T3,$T2,$T2
   2426	vpsrlq		\$26,$T0,$T1
   2427	vpsrlq		\$14,$T4,$T3
   2428	vpsrlq		\$40,$T4,$T4		# 4
   2429	vpandq		$MASK,$T2,$T2		# 2
   2430	vpandq		$MASK,$T0,$T0		# 0
   2431	#vpandq		$MASK,$T1,$T1		# 1
   2432	#vpandq		$MASK,$T3,$T3		# 3
   2433	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
   2434
   2435	vpaddq		$H2,$T2,$H2		# accumulate input
   2436	sub		\$192,$len
   2437	jbe		.Ltail_avx512
   2438	jmp		.Loop_avx512
   2439
   2440.align	32
   2441.Loop_avx512:
   2442	################################################################
   2443	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
   2444	# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
   2445	# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
   2446	# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
   2447	# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
   2448	# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
   2449	# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
   2450	# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
   2451	#   \________/\___________/
   2452	################################################################
   2453	#vpaddq		$H2,$T2,$H2		# accumulate input
   2454
   2455	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
   2456	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
   2457	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
   2458	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
   2459	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
   2460	#
   2461	# however, as h2 is "chronologically" the first one available, pull the
   2462	# corresponding operations up, so it's
   2463	#
   2464	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
   2465	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
   2466	# d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
   2467	# d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
   2468	# d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
   2469
   2470	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
   2471	 vpaddq		$H0,$T0,$H0
   2472	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
   2473	 vpandq		$MASK,$T1,$T1		# 1
   2474	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
   2475	 vpandq		$MASK,$T3,$T3		# 3
   2476	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
   2477	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
   2478	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
   2479	 vpaddq		$H1,$T1,$H1		# accumulate input
   2480	 vpaddq		$H3,$T3,$H3
   2481	 vpaddq		$H4,$T4,$H4
   2482
   2483	  vmovdqu64	16*0($inp),$T3		# load input
   2484	  vmovdqu64	16*4($inp),$T4
   2485	  lea		16*8($inp),$inp
   2486	vpmuludq	$H0,$R3,$M3
   2487	vpmuludq	$H0,$R4,$M4
   2488	vpmuludq	$H0,$R0,$M0
   2489	vpmuludq	$H0,$R1,$M1
   2490	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
   2491	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
   2492	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
   2493	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
   2494
   2495	vpmuludq	$H1,$R2,$M3
   2496	vpmuludq	$H1,$R3,$M4
   2497	vpmuludq	$H1,$S4,$M0
   2498	vpmuludq	$H0,$R2,$M2
   2499	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
   2500	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
   2501	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
   2502	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
   2503
   2504	  vpunpcklqdq	$T4,$T3,$T0		# transpose input
   2505	  vpunpckhqdq	$T4,$T3,$T4
   2506
   2507	vpmuludq	$H3,$R0,$M3
   2508	vpmuludq	$H3,$R1,$M4
   2509	vpmuludq	$H1,$R0,$M1
   2510	vpmuludq	$H1,$R1,$M2
   2511	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
   2512	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
   2513	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
   2514	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
   2515
   2516	vpmuludq	$H4,$S4,$M3
   2517	vpmuludq	$H4,$R0,$M4
   2518	vpmuludq	$H3,$S2,$M0
   2519	vpmuludq	$H3,$S3,$M1
   2520	vpaddq		$M3,$D3,$D3		# d3 += h4*s4
   2521	vpmuludq	$H3,$S4,$M2
   2522	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
   2523	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
   2524	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
   2525	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
   2526
   2527	vpmuludq	$H4,$S1,$M0
   2528	vpmuludq	$H4,$S2,$M1
   2529	vpmuludq	$H4,$S3,$M2
   2530	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
   2531	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
   2532	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
   2533
   2534	################################################################
   2535	# lazy reduction (interleaved with input splat)
   2536
   2537	 vpsrlq		\$52,$T0,$T2		# splat input
   2538	 vpsllq		\$12,$T4,$T3
   2539
   2540	vpsrlq		\$26,$D3,$H3
   2541	vpandq		$MASK,$D3,$D3
   2542	vpaddq		$H3,$D4,$H4		# h3 -> h4
   2543
   2544	 vporq		$T3,$T2,$T2
   2545
   2546	vpsrlq		\$26,$H0,$D0
   2547	vpandq		$MASK,$H0,$H0
   2548	vpaddq		$D0,$H1,$H1		# h0 -> h1
   2549
   2550	 vpandq		$MASK,$T2,$T2		# 2
   2551
   2552	vpsrlq		\$26,$H4,$D4
   2553	vpandq		$MASK,$H4,$H4
   2554
   2555	vpsrlq		\$26,$H1,$D1
   2556	vpandq		$MASK,$H1,$H1
   2557	vpaddq		$D1,$H2,$H2		# h1 -> h2
   2558
   2559	vpaddq		$D4,$H0,$H0
   2560	vpsllq		\$2,$D4,$D4
   2561	vpaddq		$D4,$H0,$H0		# h4 -> h0
   2562
   2563	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
   2564	 vpsrlq		\$26,$T0,$T1
   2565
   2566	vpsrlq		\$26,$H2,$D2
   2567	vpandq		$MASK,$H2,$H2
   2568	vpaddq		$D2,$D3,$H3		# h2 -> h3
   2569
   2570	 vpsrlq		\$14,$T4,$T3
   2571
   2572	vpsrlq		\$26,$H0,$D0
   2573	vpandq		$MASK,$H0,$H0
   2574	vpaddq		$D0,$H1,$H1		# h0 -> h1
   2575
   2576	 vpsrlq		\$40,$T4,$T4		# 4
   2577
   2578	vpsrlq		\$26,$H3,$D3
   2579	vpandq		$MASK,$H3,$H3
   2580	vpaddq		$D3,$H4,$H4		# h3 -> h4
   2581
   2582	 vpandq		$MASK,$T0,$T0		# 0
   2583	 #vpandq	$MASK,$T1,$T1		# 1
   2584	 #vpandq	$MASK,$T3,$T3		# 3
   2585	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
   2586
   2587	sub		\$128,$len
   2588	ja		.Loop_avx512
   2589
   2590.Ltail_avx512:
   2591	################################################################
   2592	# while the above multiplications were by r^8 in all lanes, in the last
   2593	# iteration we multiply the least significant lane by r^8 and the most
   2594	# significant one by r, which is why the table gets shifted...
   2595
   2596	vpsrlq		\$32,$R0,$R0		# 0105020603070408
   2597	vpsrlq		\$32,$R1,$R1
   2598	vpsrlq		\$32,$R2,$R2
   2599	vpsrlq		\$32,$S3,$S3
   2600	vpsrlq		\$32,$S4,$S4
   2601	vpsrlq		\$32,$R3,$R3
   2602	vpsrlq		\$32,$R4,$R4
   2603	vpsrlq		\$32,$S1,$S1
   2604	vpsrlq		\$32,$S2,$S2
   2605
   2606	################################################################
   2607	# load either the next or the last 64 bytes of input
   2608	lea		($inp,$len),$inp
   2609
   2610	#vpaddq		$H2,$T2,$H2		# accumulate input
   2611	vpaddq		$H0,$T0,$H0
   2612
   2613	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
   2614	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
   2615	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
   2616	 vpandq		$MASK,$T1,$T1		# 1
   2617	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
   2618	 vpandq		$MASK,$T3,$T3		# 3
   2619	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
   2620	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
   2621	 vpaddq		$H1,$T1,$H1		# accumulate input
   2622	 vpaddq		$H3,$T3,$H3
   2623	 vpaddq		$H4,$T4,$H4
   2624
   2625	  vmovdqu	16*0($inp),%x#$T0
   2626	vpmuludq	$H0,$R3,$M3
   2627	vpmuludq	$H0,$R4,$M4
   2628	vpmuludq	$H0,$R0,$M0
   2629	vpmuludq	$H0,$R1,$M1
   2630	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
   2631	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
   2632	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
   2633	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
   2634
   2635	  vmovdqu	16*1($inp),%x#$T1
   2636	vpmuludq	$H1,$R2,$M3
   2637	vpmuludq	$H1,$R3,$M4
   2638	vpmuludq	$H1,$S4,$M0
   2639	vpmuludq	$H0,$R2,$M2
   2640	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
   2641	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
   2642	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
   2643	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
   2644
   2645	  vinserti128	\$1,16*2($inp),%y#$T0,%y#$T0
   2646	vpmuludq	$H3,$R0,$M3
   2647	vpmuludq	$H3,$R1,$M4
   2648	vpmuludq	$H1,$R0,$M1
   2649	vpmuludq	$H1,$R1,$M2
   2650	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
   2651	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
   2652	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
   2653	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
   2654
   2655	  vinserti128	\$1,16*3($inp),%y#$T1,%y#$T1
   2656	vpmuludq	$H4,$S4,$M3
   2657	vpmuludq	$H4,$R0,$M4
   2658	vpmuludq	$H3,$S2,$M0
   2659	vpmuludq	$H3,$S3,$M1
   2660	vpmuludq	$H3,$S4,$M2
   2661	vpaddq		$M3,$D3,$H3		# h3 = d3 + h4*s4
   2662	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
   2663	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
   2664	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
   2665	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
   2666
   2667	vpmuludq	$H4,$S1,$M0
   2668	vpmuludq	$H4,$S2,$M1
   2669	vpmuludq	$H4,$S3,$M2
   2670	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
   2671	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
   2672	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
   2673
   2674	################################################################
   2675	# horizontal addition
   2676
   2677	mov		\$1,%eax
   2678	vpermq		\$0xb1,$H3,$D3
   2679	vpermq		\$0xb1,$D4,$H4
   2680	vpermq		\$0xb1,$H0,$D0
   2681	vpermq		\$0xb1,$H1,$D1
   2682	vpermq		\$0xb1,$H2,$D2
   2683	vpaddq		$D3,$H3,$H3
   2684	vpaddq		$D4,$H4,$H4
   2685	vpaddq		$D0,$H0,$H0
   2686	vpaddq		$D1,$H1,$H1
   2687	vpaddq		$D2,$H2,$H2
   2688
   2689	kmovw		%eax,%k3
   2690	vpermq		\$0x2,$H3,$D3
   2691	vpermq		\$0x2,$H4,$D4
   2692	vpermq		\$0x2,$H0,$D0
   2693	vpermq		\$0x2,$H1,$D1
   2694	vpermq		\$0x2,$H2,$D2
   2695	vpaddq		$D3,$H3,$H3
   2696	vpaddq		$D4,$H4,$H4
   2697	vpaddq		$D0,$H0,$H0
   2698	vpaddq		$D1,$H1,$H1
   2699	vpaddq		$D2,$H2,$H2
   2700
   2701	vextracti64x4	\$0x1,$H3,%y#$D3
   2702	vextracti64x4	\$0x1,$H4,%y#$D4
   2703	vextracti64x4	\$0x1,$H0,%y#$D0
   2704	vextracti64x4	\$0x1,$H1,%y#$D1
   2705	vextracti64x4	\$0x1,$H2,%y#$D2
   2706	vpaddq		$D3,$H3,${H3}{%k3}{z}	# keep single qword in case
   2707	vpaddq		$D4,$H4,${H4}{%k3}{z}	# it's passed to .Ltail_avx2
   2708	vpaddq		$D0,$H0,${H0}{%k3}{z}
   2709	vpaddq		$D1,$H1,${H1}{%k3}{z}
   2710	vpaddq		$D2,$H2,${H2}{%k3}{z}
   2711___
   2712map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
   2713map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
   2714$code.=<<___;
   2715	################################################################
   2716	# lazy reduction (interleaved with input splat)
   2717
   2718	vpsrlq		\$26,$H3,$D3
   2719	vpand		$MASK,$H3,$H3
   2720	 vpsrldq	\$6,$T0,$T2		# splat input
   2721	 vpsrldq	\$6,$T1,$T3
   2722	 vpunpckhqdq	$T1,$T0,$T4		# 4
   2723	vpaddq		$D3,$H4,$H4		# h3 -> h4
   2724
   2725	vpsrlq		\$26,$H0,$D0
   2726	vpand		$MASK,$H0,$H0
   2727	 vpunpcklqdq	$T3,$T2,$T2		# 2:3
   2728	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
   2729	vpaddq		$D0,$H1,$H1		# h0 -> h1
   2730
   2731	vpsrlq		\$26,$H4,$D4
   2732	vpand		$MASK,$H4,$H4
   2733
   2734	vpsrlq		\$26,$H1,$D1
   2735	vpand		$MASK,$H1,$H1
   2736	 vpsrlq		\$30,$T2,$T3
   2737	 vpsrlq		\$4,$T2,$T2
   2738	vpaddq		$D1,$H2,$H2		# h1 -> h2
   2739
   2740	vpaddq		$D4,$H0,$H0
   2741	vpsllq		\$2,$D4,$D4
   2742	 vpsrlq		\$26,$T0,$T1
   2743	 vpsrlq		\$40,$T4,$T4		# 4
   2744	vpaddq		$D4,$H0,$H0		# h4 -> h0
   2745
   2746	vpsrlq		\$26,$H2,$D2
   2747	vpand		$MASK,$H2,$H2
   2748	 vpand		$MASK,$T2,$T2		# 2
   2749	 vpand		$MASK,$T0,$T0		# 0
   2750	vpaddq		$D2,$H3,$H3		# h2 -> h3
   2751
   2752	vpsrlq		\$26,$H0,$D0
   2753	vpand		$MASK,$H0,$H0
   2754	 vpaddq		$H2,$T2,$H2		# accumulate input for .Ltail_avx2
   2755	 vpand		$MASK,$T1,$T1		# 1
   2756	vpaddq		$D0,$H1,$H1		# h0 -> h1
   2757
   2758	vpsrlq		\$26,$H3,$D3
   2759	vpand		$MASK,$H3,$H3
   2760	 vpand		$MASK,$T3,$T3		# 3
   2761	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
   2762	vpaddq		$D3,$H4,$H4		# h3 -> h4
   2763
   2764	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
   2765	add		\$64,$len
   2766	jnz		.Ltail_avx2$suffix
   2767
   2768	vpsubq		$T2,$H2,$H2		# undo input accumulation
   2769	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
   2770	vmovd		%x#$H1,`4*1-48-64`($ctx)
   2771	vmovd		%x#$H2,`4*2-48-64`($ctx)
   2772	vmovd		%x#$H3,`4*3-48-64`($ctx)
   2773	vmovd		%x#$H4,`4*4-48-64`($ctx)
   2774	vzeroall
   2775___
   2776$code.=<<___	if ($win64);
   2777	movdqa		-0xb0(%r10),%xmm6
   2778	movdqa		-0xa0(%r10),%xmm7
   2779	movdqa		-0x90(%r10),%xmm8
   2780	movdqa		-0x80(%r10),%xmm9
   2781	movdqa		-0x70(%r10),%xmm10
   2782	movdqa		-0x60(%r10),%xmm11
   2783	movdqa		-0x50(%r10),%xmm12
   2784	movdqa		-0x40(%r10),%xmm13
   2785	movdqa		-0x30(%r10),%xmm14
   2786	movdqa		-0x20(%r10),%xmm15
   2787	lea		-8(%r10),%rsp
   2788.Ldo_avx512_epilogue:
   2789___
   2790$code.=<<___	if (!$win64);
   2791	lea		-8(%r10),%rsp
   2792.cfi_def_cfa_register	%rsp
   2793___
   2794$code.=<<___;
   2795	RET
   2796.cfi_endproc
   2797___
   2798
   2799}
   2800
   2801}
   2802
   2803&declare_function("poly1305_blocks_avx2", 32, 4);
   2804poly1305_blocks_avxN(0);
   2805&end_function("poly1305_blocks_avx2");
   2806
   2807#######################################################################
   2808if ($avx>2) {
   2809	# On entry we have the input length divisible by 64. But since the inner
   2810	# loop processes 128 bytes per iteration, cases where the length is not
   2811	# divisible by 128 are handled by passing the tail 64 bytes to .Ltail_avx2.
   2812	# For this reason the stack layout is kept identical to poly1305_blocks_avx2.
   2813	# If not for this tail, we wouldn't even have to allocate a stack frame...
   2814
   2815if($kernel) {
   2816	$code .= "#ifdef CONFIG_AS_AVX512\n";
   2817}
   2818
   2819&declare_function("poly1305_blocks_avx512", 32, 4);
   2820poly1305_blocks_avxN(1);
   2821&end_function("poly1305_blocks_avx512");
   2822
   2823if ($kernel) {
   2824	$code .= "#endif\n";
   2825}
   2826
   2827if (!$kernel && $avx>3) {
   2828########################################################################
   2829# VPMADD52 version using 2^44 radix.
   2830#
   2831	# One can argue that base 2^52 would be more natural. Well, even though
   2832	# some operations would be more natural, one has to recognize a couple of
   2833	# things. Base 2^52 doesn't provide an advantage over base 2^44 if you look
   2834	# at the amount of multiply-and-accumulate operations. Secondly, it makes it
   2835	# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
   2836	# reference implementations], which means that more such operations would
   2837	# have to be performed in the inner loop, which in turn makes the critical
   2838	# path longer. In other words, even though base 2^44 reduction might look
   2839	# less elegant, the overall critical path is actually shorter...
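#
# To illustrate the s[] = r*20 trick used below (comment sketch only): with
# limbs at weights 2^0, 2^44 and 2^88, the cross products that spill past the
# modulus land at weight 2^132 (or 2^132 times 2^44), and since
#
#	2^132 = 4*2^130 = 4*5 = 20 (mod 2^130-5)
#
# reduction only ever needs r1 and r2 pre-multiplied by 20, which is exactly
# what the *5 followed by <<2 pair in poly1305_init_base2_44 computes.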
   2840
   2841########################################################################
   2842# Layout of opaque area is following.
   2843	# The layout of the opaque area is as follows.
   2844#	unsigned __int64 h[3];		# current hash value base 2^44
   2845#	unsigned __int64 s[2];		# key value*20 base 2^44
   2846#	unsigned __int64 r[3];		# key value base 2^44
   2847#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
   2848#					# r^n positions reflect
   2849#					# placement in register, not
   2850#					# memory, R[3] is R[1]*20
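#
# A rough byte-offset sketch of the same area (illustrative only):
#
#	  0.. 23	h[0..2]		hash value, base 2^44
#	 24.. 39	s[0..1]		r[1], r[2] scaled by 20
#	 40.. 63	r[0..2]		key value, base 2^44
#	 64..191	R[0..3]		limb-wise key powers, R[3] = R[1]*20;
#					64($ctx) doubles as the "are the powers
#					computed yet?" flag (negative means not
#					yet, see poly1305_init_base2_44)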
   2851
   2852$code.=<<___;
   2853.type	poly1305_init_base2_44,\@function,3
   2854.align	32
   2855poly1305_init_base2_44:
   2856	xor	%eax,%eax
   2857	mov	%rax,0($ctx)		# initialize hash value
   2858	mov	%rax,8($ctx)
   2859	mov	%rax,16($ctx)
   2860
   2861.Linit_base2_44:
   2862	lea	poly1305_blocks_vpmadd52(%rip),%r10
   2863	lea	poly1305_emit_base2_44(%rip),%r11
   2864
   2865	mov	\$0x0ffffffc0fffffff,%rax
   2866	mov	\$0x0ffffffc0ffffffc,%rcx
   2867	and	0($inp),%rax
   2868	mov	\$0x00000fffffffffff,%r8
   2869	and	8($inp),%rcx
   2870	mov	\$0x00000fffffffffff,%r9
   2871	and	%rax,%r8
   2872	shrd	\$44,%rcx,%rax
   2873	mov	%r8,40($ctx)		# r0
   2874	and	%r9,%rax
   2875	shr	\$24,%rcx
   2876	mov	%rax,48($ctx)		# r1
   2877	lea	(%rax,%rax,4),%rax	# *5
   2878	mov	%rcx,56($ctx)		# r2
   2879	shl	\$2,%rax		# magic <<2
   2880	lea	(%rcx,%rcx,4),%rcx	# *5
   2881	shl	\$2,%rcx		# magic <<2
   2882	mov	%rax,24($ctx)		# s1
   2883	mov	%rcx,32($ctx)		# s2
   2884	movq	\$-1,64($ctx)		# write impossible value
   2885___
   2886$code.=<<___	if ($flavour !~ /elf32/);
   2887	mov	%r10,0(%rdx)
   2888	mov	%r11,8(%rdx)
   2889___
   2890$code.=<<___	if ($flavour =~ /elf32/);
   2891	mov	%r10d,0(%rdx)
   2892	mov	%r11d,4(%rdx)
   2893___
   2894$code.=<<___;
   2895	mov	\$1,%eax
   2896	RET
   2897.size	poly1305_init_base2_44,.-poly1305_init_base2_44
   2898___
   2899{
   2900my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
   2901my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
   2902my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
   2903
   2904$code.=<<___;
   2905.type	poly1305_blocks_vpmadd52,\@function,4
   2906.align	32
   2907poly1305_blocks_vpmadd52:
   2908	shr	\$4,$len
   2909	jz	.Lno_data_vpmadd52		# too short
   2910
   2911	shl	\$40,$padbit
   2912	mov	64($ctx),%r8			# peek on power of the key
   2913
   2914	# if the powers of the key are not calculated yet, process up to 3
   2915	# blocks with this single-block subroutine, otherwise ensure that the
   2916	# length is divisible by 2 blocks and pass the rest down to the next
   2917	# subroutine...
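	# Illustrative sketch of the dispatch below: %rax ends up holding the
	# number of 16-byte blocks to peel off with this single-block loop,
	#
	#	mask = (len >= 4 blocks || powers already computed) ? 1 : 3;
	#	rax  = len & mask;		# 0..3 blocks handled here
	#	len -= rax;			# remainder, divisible by 2 or 4,
	#					# goes to poly1305_blocks_vpmadd52_4x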
   2918
   2919	mov	\$3,%rax
   2920	mov	\$1,%r10
   2921	cmp	\$4,$len			# is input long
   2922	cmovae	%r10,%rax
   2923	test	%r8,%r8				# is power value impossible?
   2924	cmovns	%r10,%rax
   2925
   2926	and	$len,%rax			# is input of favourable length?
   2927	jz	.Lblocks_vpmadd52_4x
   2928
   2929	sub		%rax,$len
   2930	mov		\$7,%r10d
   2931	mov		\$1,%r11d
   2932	kmovw		%r10d,%k7
   2933	lea		.L2_44_inp_permd(%rip),%r10
   2934	kmovw		%r11d,%k1
   2935
   2936	vmovq		$padbit,%x#$PAD
   2937	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
   2938	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
   2939	vpermq		\$0xcf,$PAD,$PAD
   2940	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask
   2941
   2942	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
   2943	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
   2944	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
   2945	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}
   2946
   2947	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
   2948	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft
   2949
   2950	jmp		.Loop_vpmadd52
   2951
   2952.align	32
   2953.Loop_vpmadd52:
   2954	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
   2955	lea		16($inp),$inp
   2956
   2957	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
   2958	vpsrlvq		$inp_shift,$T0,$T0
   2959	vpandq		$reduc_mask,$T0,$T0
   2960	vporq		$PAD,$T0,$T0
   2961
   2962	vpaddq		$T0,$Dlo,$Dlo		# accumulate input
   2963
   2964	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
   2965	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
   2966	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}
   2967
   2968	vpxord		$Dlo,$Dlo,$Dlo
   2969	vpxord		$Dhi,$Dhi,$Dhi
   2970
   2971	vpmadd52luq	$r2r1r0,$H0,$Dlo
   2972	vpmadd52huq	$r2r1r0,$H0,$Dhi
   2973
   2974	vpmadd52luq	$r1r0s2,$H1,$Dlo
   2975	vpmadd52huq	$r1r0s2,$H1,$Dhi
   2976
   2977	vpmadd52luq	$r0s2s1,$H2,$Dlo
   2978	vpmadd52huq	$r0s2s1,$H2,$Dhi
   2979
   2980	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
   2981	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
   2982	vpandq		$reduc_mask,$Dlo,$Dlo
   2983
   2984	vpaddq		$T0,$Dhi,$Dhi
   2985
   2986	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword
   2987
   2988	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)
   2989
   2990	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
   2991	vpandq		$reduc_mask,$Dlo,$Dlo
   2992
   2993	vpermq		\$0b10010011,$T0,$T0
   2994
   2995	vpaddq		$T0,$Dlo,$Dlo
   2996
   2997	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}
   2998
   2999	vpaddq		$T0,$Dlo,$Dlo
   3000	vpsllq		\$2,$T0,$T0
   3001
   3002	vpaddq		$T0,$Dlo,$Dlo
   3003
   3004	dec		%rax			# len-=16
   3005	jnz		.Loop_vpmadd52
   3006
   3007	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value
   3008
   3009	test		$len,$len
   3010	jnz		.Lblocks_vpmadd52_4x
   3011
   3012.Lno_data_vpmadd52:
   3013	RET
   3014.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
   3015___
   3016}
   3017{
   3018########################################################################
   3019	# As implied by its name, the 4x subroutine processes 4 blocks in parallel
   3020	# (but also handles lengths of 4*n+2 blocks). It uses up to the 4th key
   3021	# power and operates on 256-bit %ymm registers.
   3022
   3023my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
   3024my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
   3025my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
   3026
   3027$code.=<<___;
   3028.type	poly1305_blocks_vpmadd52_4x,\@function,4
   3029.align	32
   3030poly1305_blocks_vpmadd52_4x:
   3031	shr	\$4,$len
   3032	jz	.Lno_data_vpmadd52_4x		# too short
   3033
   3034	shl	\$40,$padbit
   3035	mov	64($ctx),%r8			# peek on power of the key
   3036
   3037.Lblocks_vpmadd52_4x:
   3038	vpbroadcastq	$padbit,$PAD
   3039
   3040	vmovdqa64	.Lx_mask44(%rip),$mask44
   3041	mov		\$5,%eax
   3042	vmovdqa64	.Lx_mask42(%rip),$mask42
   3043	kmovw		%eax,%k1		# used in 2x path
   3044
   3045	test		%r8,%r8			# is power value impossible?
   3046	js		.Linit_vpmadd52		# if it is, then init R[4]
   3047
   3048	vmovq		0($ctx),%x#$H0		# load current hash value
   3049	vmovq		8($ctx),%x#$H1
   3050	vmovq		16($ctx),%x#$H2
   3051
   3052	test		\$3,$len		# is length 4*n+2?
   3053	jnz		.Lblocks_vpmadd52_2x_do
   3054
   3055.Lblocks_vpmadd52_4x_do:
   3056	vpbroadcastq	64($ctx),$R0		# load 4th power of the key
   3057	vpbroadcastq	96($ctx),$R1
   3058	vpbroadcastq	128($ctx),$R2
   3059	vpbroadcastq	160($ctx),$S1
   3060
   3061.Lblocks_vpmadd52_4x_key_loaded:
   3062	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
   3063	vpaddq		$R2,$S2,$S2
   3064	vpsllq		\$2,$S2,$S2
   3065
   3066	test		\$7,$len		# is len 8*n?
   3067	jz		.Lblocks_vpmadd52_8x
   3068
   3069	vmovdqu64	16*0($inp),$T2		# load data
   3070	vmovdqu64	16*2($inp),$T3
   3071	lea		16*4($inp),$inp
   3072
   3073	vpunpcklqdq	$T3,$T2,$T1		# transpose data
   3074	vpunpckhqdq	$T3,$T2,$T3
   3075
   3076	# at this point 64-bit lanes are ordered as 3-1-2-0
   3077
   3078	vpsrlq		\$24,$T3,$T2		# splat the data
   3079	vporq		$PAD,$T2,$T2
   3080	 vpaddq		$T2,$H2,$H2		# accumulate input
   3081	vpandq		$mask44,$T1,$T0
   3082	vpsrlq		\$44,$T1,$T1
   3083	vpsllq		\$20,$T3,$T3
   3084	vporq		$T3,$T1,$T1
   3085	vpandq		$mask44,$T1,$T1
   3086
   3087	sub		\$4,$len
   3088	jz		.Ltail_vpmadd52_4x
   3089	jmp		.Loop_vpmadd52_4x
   3090	ud2
   3091
   3092.align	32
   3093.Linit_vpmadd52:
   3094	vmovq		24($ctx),%x#$S1		# load key
   3095	vmovq		56($ctx),%x#$H2
   3096	vmovq		32($ctx),%x#$S2
   3097	vmovq		40($ctx),%x#$R0
   3098	vmovq		48($ctx),%x#$R1
   3099
   3100	vmovdqa		$R0,$H0
   3101	vmovdqa		$R1,$H1
   3102	vmovdqa		$H2,$R2
   3103
   3104	mov		\$2,%eax
   3105
   3106.Lmul_init_vpmadd52:
   3107	vpxorq		$D0lo,$D0lo,$D0lo
   3108	vpmadd52luq	$H2,$S1,$D0lo
   3109	vpxorq		$D0hi,$D0hi,$D0hi
   3110	vpmadd52huq	$H2,$S1,$D0hi
   3111	vpxorq		$D1lo,$D1lo,$D1lo
   3112	vpmadd52luq	$H2,$S2,$D1lo
   3113	vpxorq		$D1hi,$D1hi,$D1hi
   3114	vpmadd52huq	$H2,$S2,$D1hi
   3115	vpxorq		$D2lo,$D2lo,$D2lo
   3116	vpmadd52luq	$H2,$R0,$D2lo
   3117	vpxorq		$D2hi,$D2hi,$D2hi
   3118	vpmadd52huq	$H2,$R0,$D2hi
   3119
   3120	vpmadd52luq	$H0,$R0,$D0lo
   3121	vpmadd52huq	$H0,$R0,$D0hi
   3122	vpmadd52luq	$H0,$R1,$D1lo
   3123	vpmadd52huq	$H0,$R1,$D1hi
   3124	vpmadd52luq	$H0,$R2,$D2lo
   3125	vpmadd52huq	$H0,$R2,$D2hi
   3126
   3127	vpmadd52luq	$H1,$S2,$D0lo
   3128	vpmadd52huq	$H1,$S2,$D0hi
   3129	vpmadd52luq	$H1,$R0,$D1lo
   3130	vpmadd52huq	$H1,$R0,$D1hi
   3131	vpmadd52luq	$H1,$R1,$D2lo
   3132	vpmadd52huq	$H1,$R1,$D2hi
   3133
   3134	################################################################
   3135	# partial reduction
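	# Illustrative note: vpmadd52luq/vpmadd52huq return bits 0..51 and 52..103
	# of each 52x52-bit product, so in scalar terms each step below is
	#
	#	carry = (lo >> 44) + (hi << 8);		# 8 = 52-44 re-aligns hi
	#	limb  = lo & ((1<<44)-1);
	#
	# with 42/10 used for the top (42-bit) limb, whose carry then wraps into
	# limb 0 multiplied by 5 (the add, vpsllq \$2, add sequence).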
   3136	vpsrlq		\$44,$D0lo,$tmp
   3137	vpsllq		\$8,$D0hi,$D0hi
   3138	vpandq		$mask44,$D0lo,$H0
   3139	vpaddq		$tmp,$D0hi,$D0hi
   3140
   3141	vpaddq		$D0hi,$D1lo,$D1lo
   3142
   3143	vpsrlq		\$44,$D1lo,$tmp
   3144	vpsllq		\$8,$D1hi,$D1hi
   3145	vpandq		$mask44,$D1lo,$H1
   3146	vpaddq		$tmp,$D1hi,$D1hi
   3147
   3148	vpaddq		$D1hi,$D2lo,$D2lo
   3149
   3150	vpsrlq		\$42,$D2lo,$tmp
   3151	vpsllq		\$10,$D2hi,$D2hi
   3152	vpandq		$mask42,$D2lo,$H2
   3153	vpaddq		$tmp,$D2hi,$D2hi
   3154
   3155	vpaddq		$D2hi,$H0,$H0
   3156	vpsllq		\$2,$D2hi,$D2hi
   3157
   3158	vpaddq		$D2hi,$H0,$H0
   3159
   3160	vpsrlq		\$44,$H0,$tmp		# additional step
   3161	vpandq		$mask44,$H0,$H0
   3162
   3163	vpaddq		$tmp,$H1,$H1
   3164
   3165	dec		%eax
   3166	jz		.Ldone_init_vpmadd52
   3167
   3168	vpunpcklqdq	$R1,$H1,$R1		# 1,2
   3169	vpbroadcastq	%x#$H1,%x#$H1		# 2,2
   3170	vpunpcklqdq	$R2,$H2,$R2
   3171	vpbroadcastq	%x#$H2,%x#$H2
   3172	vpunpcklqdq	$R0,$H0,$R0
   3173	vpbroadcastq	%x#$H0,%x#$H0
   3174
   3175	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
   3176	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
   3177	vpaddq		$R1,$S1,$S1
   3178	vpaddq		$R2,$S2,$S2
   3179	vpsllq		\$2,$S1,$S1
   3180	vpsllq		\$2,$S2,$S2
   3181
   3182	jmp		.Lmul_init_vpmadd52
   3183	ud2
   3184
   3185.align	32
   3186.Ldone_init_vpmadd52:
   3187	vinserti128	\$1,%x#$R1,$H1,$R1	# 1,2,3,4
   3188	vinserti128	\$1,%x#$R2,$H2,$R2
   3189	vinserti128	\$1,%x#$R0,$H0,$R0
   3190
   3191	vpermq		\$0b11011000,$R1,$R1	# 1,3,2,4
   3192	vpermq		\$0b11011000,$R2,$R2
   3193	vpermq		\$0b11011000,$R0,$R0
   3194
   3195	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
   3196	vpaddq		$R1,$S1,$S1
   3197	vpsllq		\$2,$S1,$S1
   3198
   3199	vmovq		0($ctx),%x#$H0		# load current hash value
   3200	vmovq		8($ctx),%x#$H1
   3201	vmovq		16($ctx),%x#$H2
   3202
   3203	test		\$3,$len		# is length 4*n+2?
   3204	jnz		.Ldone_init_vpmadd52_2x
   3205
   3206	vmovdqu64	$R0,64($ctx)		# save key powers
   3207	vpbroadcastq	%x#$R0,$R0		# broadcast 4th power
   3208	vmovdqu64	$R1,96($ctx)
   3209	vpbroadcastq	%x#$R1,$R1
   3210	vmovdqu64	$R2,128($ctx)
   3211	vpbroadcastq	%x#$R2,$R2
   3212	vmovdqu64	$S1,160($ctx)
   3213	vpbroadcastq	%x#$S1,$S1
   3214
   3215	jmp		.Lblocks_vpmadd52_4x_key_loaded
   3216	ud2
   3217
   3218.align	32
   3219.Ldone_init_vpmadd52_2x:
   3220	vmovdqu64	$R0,64($ctx)		# save key powers
   3221	vpsrldq		\$8,$R0,$R0		# 0-1-0-2
   3222	vmovdqu64	$R1,96($ctx)
   3223	vpsrldq		\$8,$R1,$R1
   3224	vmovdqu64	$R2,128($ctx)
   3225	vpsrldq		\$8,$R2,$R2
   3226	vmovdqu64	$S1,160($ctx)
   3227	vpsrldq		\$8,$S1,$S1
   3228	jmp		.Lblocks_vpmadd52_2x_key_loaded
   3229	ud2
   3230
   3231.align	32
   3232.Lblocks_vpmadd52_2x_do:
   3233	vmovdqu64	128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
   3234	vmovdqu64	160+8($ctx),${S1}{%k1}{z}
   3235	vmovdqu64	64+8($ctx),${R0}{%k1}{z}
   3236	vmovdqu64	96+8($ctx),${R1}{%k1}{z}
   3237
   3238.Lblocks_vpmadd52_2x_key_loaded:
   3239	vmovdqu64	16*0($inp),$T2		# load data
   3240	vpxorq		$T3,$T3,$T3
   3241	lea		16*2($inp),$inp
   3242
   3243	vpunpcklqdq	$T3,$T2,$T1		# transpose data
   3244	vpunpckhqdq	$T3,$T2,$T3
   3245
   3246	# at this point 64-bit lanes are ordered as x-1-x-0
   3247
   3248	vpsrlq		\$24,$T3,$T2		# splat the data
   3249	vporq		$PAD,$T2,$T2
   3250	 vpaddq		$T2,$H2,$H2		# accumulate input
   3251	vpandq		$mask44,$T1,$T0
   3252	vpsrlq		\$44,$T1,$T1
   3253	vpsllq		\$20,$T3,$T3
   3254	vporq		$T3,$T1,$T1
   3255	vpandq		$mask44,$T1,$T1
   3256
   3257	jmp		.Ltail_vpmadd52_2x
   3258	ud2
   3259
   3260.align	32
   3261.Loop_vpmadd52_4x:
   3262	#vpaddq		$T2,$H2,$H2		# accumulate input
   3263	vpaddq		$T0,$H0,$H0
   3264	vpaddq		$T1,$H1,$H1
   3265
   3266	vpxorq		$D0lo,$D0lo,$D0lo
   3267	vpmadd52luq	$H2,$S1,$D0lo
   3268	vpxorq		$D0hi,$D0hi,$D0hi
   3269	vpmadd52huq	$H2,$S1,$D0hi
   3270	vpxorq		$D1lo,$D1lo,$D1lo
   3271	vpmadd52luq	$H2,$S2,$D1lo
   3272	vpxorq		$D1hi,$D1hi,$D1hi
   3273	vpmadd52huq	$H2,$S2,$D1hi
   3274	vpxorq		$D2lo,$D2lo,$D2lo
   3275	vpmadd52luq	$H2,$R0,$D2lo
   3276	vpxorq		$D2hi,$D2hi,$D2hi
   3277	vpmadd52huq	$H2,$R0,$D2hi
   3278
   3279	 vmovdqu64	16*0($inp),$T2		# load data
   3280	 vmovdqu64	16*2($inp),$T3
   3281	 lea		16*4($inp),$inp
   3282	vpmadd52luq	$H0,$R0,$D0lo
   3283	vpmadd52huq	$H0,$R0,$D0hi
   3284	vpmadd52luq	$H0,$R1,$D1lo
   3285	vpmadd52huq	$H0,$R1,$D1hi
   3286	vpmadd52luq	$H0,$R2,$D2lo
   3287	vpmadd52huq	$H0,$R2,$D2hi
   3288
   3289	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
   3290	 vpunpckhqdq	$T3,$T2,$T3
   3291	vpmadd52luq	$H1,$S2,$D0lo
   3292	vpmadd52huq	$H1,$S2,$D0hi
   3293	vpmadd52luq	$H1,$R0,$D1lo
   3294	vpmadd52huq	$H1,$R0,$D1hi
   3295	vpmadd52luq	$H1,$R1,$D2lo
   3296	vpmadd52huq	$H1,$R1,$D2hi
   3297
   3298	################################################################
   3299	# partial reduction (interleaved with data splat)
   3300	vpsrlq		\$44,$D0lo,$tmp
   3301	vpsllq		\$8,$D0hi,$D0hi
   3302	vpandq		$mask44,$D0lo,$H0
   3303	vpaddq		$tmp,$D0hi,$D0hi
   3304
   3305	 vpsrlq		\$24,$T3,$T2
   3306	 vporq		$PAD,$T2,$T2
   3307	vpaddq		$D0hi,$D1lo,$D1lo
   3308
   3309	vpsrlq		\$44,$D1lo,$tmp
   3310	vpsllq		\$8,$D1hi,$D1hi
   3311	vpandq		$mask44,$D1lo,$H1
   3312	vpaddq		$tmp,$D1hi,$D1hi
   3313
   3314	 vpandq		$mask44,$T1,$T0
   3315	 vpsrlq		\$44,$T1,$T1
   3316	 vpsllq		\$20,$T3,$T3
   3317	vpaddq		$D1hi,$D2lo,$D2lo
   3318
   3319	vpsrlq		\$42,$D2lo,$tmp
   3320	vpsllq		\$10,$D2hi,$D2hi
   3321	vpandq		$mask42,$D2lo,$H2
   3322	vpaddq		$tmp,$D2hi,$D2hi
   3323
   3324	  vpaddq	$T2,$H2,$H2		# accumulate input
   3325	vpaddq		$D2hi,$H0,$H0
   3326	vpsllq		\$2,$D2hi,$D2hi
   3327
   3328	vpaddq		$D2hi,$H0,$H0
   3329	 vporq		$T3,$T1,$T1
   3330	 vpandq		$mask44,$T1,$T1
   3331
   3332	vpsrlq		\$44,$H0,$tmp		# additional step
   3333	vpandq		$mask44,$H0,$H0
   3334
   3335	vpaddq		$tmp,$H1,$H1
   3336
   3337	sub		\$4,$len		# len-=64
   3338	jnz		.Loop_vpmadd52_4x
   3339
   3340.Ltail_vpmadd52_4x:
   3341	vmovdqu64	128($ctx),$R2		# load all key powers
   3342	vmovdqu64	160($ctx),$S1
   3343	vmovdqu64	64($ctx),$R0
   3344	vmovdqu64	96($ctx),$R1
   3345
   3346.Ltail_vpmadd52_2x:
   3347	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
   3348	vpaddq		$R2,$S2,$S2
   3349	vpsllq		\$2,$S2,$S2
   3350
   3351	#vpaddq		$T2,$H2,$H2		# accumulate input
   3352	vpaddq		$T0,$H0,$H0
   3353	vpaddq		$T1,$H1,$H1
   3354
   3355	vpxorq		$D0lo,$D0lo,$D0lo
   3356	vpmadd52luq	$H2,$S1,$D0lo
   3357	vpxorq		$D0hi,$D0hi,$D0hi
   3358	vpmadd52huq	$H2,$S1,$D0hi
   3359	vpxorq		$D1lo,$D1lo,$D1lo
   3360	vpmadd52luq	$H2,$S2,$D1lo
   3361	vpxorq		$D1hi,$D1hi,$D1hi
   3362	vpmadd52huq	$H2,$S2,$D1hi
   3363	vpxorq		$D2lo,$D2lo,$D2lo
   3364	vpmadd52luq	$H2,$R0,$D2lo
   3365	vpxorq		$D2hi,$D2hi,$D2hi
   3366	vpmadd52huq	$H2,$R0,$D2hi
   3367
   3368	vpmadd52luq	$H0,$R0,$D0lo
   3369	vpmadd52huq	$H0,$R0,$D0hi
   3370	vpmadd52luq	$H0,$R1,$D1lo
   3371	vpmadd52huq	$H0,$R1,$D1hi
   3372	vpmadd52luq	$H0,$R2,$D2lo
   3373	vpmadd52huq	$H0,$R2,$D2hi
   3374
   3375	vpmadd52luq	$H1,$S2,$D0lo
   3376	vpmadd52huq	$H1,$S2,$D0hi
   3377	vpmadd52luq	$H1,$R0,$D1lo
   3378	vpmadd52huq	$H1,$R0,$D1hi
   3379	vpmadd52luq	$H1,$R1,$D2lo
   3380	vpmadd52huq	$H1,$R1,$D2hi
   3381
   3382	################################################################
   3383	# horizontal addition
   3384
   3385	mov		\$1,%eax
   3386	kmovw		%eax,%k1
   3387	vpsrldq		\$8,$D0lo,$T0
   3388	vpsrldq		\$8,$D0hi,$H0
   3389	vpsrldq		\$8,$D1lo,$T1
   3390	vpsrldq		\$8,$D1hi,$H1
   3391	vpaddq		$T0,$D0lo,$D0lo
   3392	vpaddq		$H0,$D0hi,$D0hi
   3393	vpsrldq		\$8,$D2lo,$T2
   3394	vpsrldq		\$8,$D2hi,$H2
   3395	vpaddq		$T1,$D1lo,$D1lo
   3396	vpaddq		$H1,$D1hi,$D1hi
   3397	 vpermq		\$0x2,$D0lo,$T0
   3398	 vpermq		\$0x2,$D0hi,$H0
   3399	vpaddq		$T2,$D2lo,$D2lo
   3400	vpaddq		$H2,$D2hi,$D2hi
   3401
   3402	vpermq		\$0x2,$D1lo,$T1
   3403	vpermq		\$0x2,$D1hi,$H1
   3404	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
   3405	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
   3406	vpermq		\$0x2,$D2lo,$T2
   3407	vpermq		\$0x2,$D2hi,$H2
   3408	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
   3409	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
   3410	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
   3411	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
   3412
   3413	################################################################
   3414	# partial reduction
   3415	vpsrlq		\$44,$D0lo,$tmp
   3416	vpsllq		\$8,$D0hi,$D0hi
   3417	vpandq		$mask44,$D0lo,$H0
   3418	vpaddq		$tmp,$D0hi,$D0hi
   3419
   3420	vpaddq		$D0hi,$D1lo,$D1lo
   3421
   3422	vpsrlq		\$44,$D1lo,$tmp
   3423	vpsllq		\$8,$D1hi,$D1hi
   3424	vpandq		$mask44,$D1lo,$H1
   3425	vpaddq		$tmp,$D1hi,$D1hi
   3426
   3427	vpaddq		$D1hi,$D2lo,$D2lo
   3428
   3429	vpsrlq		\$42,$D2lo,$tmp
   3430	vpsllq		\$10,$D2hi,$D2hi
   3431	vpandq		$mask42,$D2lo,$H2
   3432	vpaddq		$tmp,$D2hi,$D2hi
   3433
   3434	vpaddq		$D2hi,$H0,$H0
   3435	vpsllq		\$2,$D2hi,$D2hi
   3436
   3437	vpaddq		$D2hi,$H0,$H0
   3438
   3439	vpsrlq		\$44,$H0,$tmp		# additional step
   3440	vpandq		$mask44,$H0,$H0
   3441
   3442	vpaddq		$tmp,$H1,$H1
   3443						# at this point $len is
   3444						# either 4*n+2 or 0...
   3445	sub		\$2,$len		# len-=32
   3446	ja		.Lblocks_vpmadd52_4x_do
   3447
   3448	vmovq		%x#$H0,0($ctx)
   3449	vmovq		%x#$H1,8($ctx)
   3450	vmovq		%x#$H2,16($ctx)
   3451	vzeroall
   3452
   3453.Lno_data_vpmadd52_4x:
   3454	RET
   3455.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
   3456___
   3457}
   3458{
   3459########################################################################
   3460	# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
   3461	# This is an intermediate version, as it's used only in cases when the input
   3462	# length is either 8*n, 8*n+1 or 8*n+2...
   3463
   3464my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
   3465my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
   3466my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
   3467my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
   3468
   3469$code.=<<___;
   3470.type	poly1305_blocks_vpmadd52_8x,\@function,4
   3471.align	32
   3472poly1305_blocks_vpmadd52_8x:
   3473	shr	\$4,$len
   3474	jz	.Lno_data_vpmadd52_8x		# too short
   3475
   3476	shl	\$40,$padbit
   3477	mov	64($ctx),%r8			# peek on power of the key
   3478
   3479	vmovdqa64	.Lx_mask44(%rip),$mask44
   3480	vmovdqa64	.Lx_mask42(%rip),$mask42
   3481
   3482	test	%r8,%r8				# is power value impossible?
   3483	js	.Linit_vpmadd52			# if it is, then init R[4]
   3484
   3485	vmovq	0($ctx),%x#$H0			# load current hash value
   3486	vmovq	8($ctx),%x#$H1
   3487	vmovq	16($ctx),%x#$H2
   3488
   3489.Lblocks_vpmadd52_8x:
   3490	################################################################
   3491	# first we calculate more key powers
   3492
   3493	vmovdqu64	128($ctx),$R2		# load 1-3-2-4 powers
   3494	vmovdqu64	160($ctx),$S1
   3495	vmovdqu64	64($ctx),$R0
   3496	vmovdqu64	96($ctx),$R1
   3497
   3498	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
   3499	vpaddq		$R2,$S2,$S2
   3500	vpsllq		\$2,$S2,$S2
   3501
   3502	vpbroadcastq	%x#$R2,$RR2		# broadcast 4th power
   3503	vpbroadcastq	%x#$R0,$RR0
   3504	vpbroadcastq	%x#$R1,$RR1
   3505
   3506	vpxorq		$D0lo,$D0lo,$D0lo
   3507	vpmadd52luq	$RR2,$S1,$D0lo
   3508	vpxorq		$D0hi,$D0hi,$D0hi
   3509	vpmadd52huq	$RR2,$S1,$D0hi
   3510	vpxorq		$D1lo,$D1lo,$D1lo
   3511	vpmadd52luq	$RR2,$S2,$D1lo
   3512	vpxorq		$D1hi,$D1hi,$D1hi
   3513	vpmadd52huq	$RR2,$S2,$D1hi
   3514	vpxorq		$D2lo,$D2lo,$D2lo
   3515	vpmadd52luq	$RR2,$R0,$D2lo
   3516	vpxorq		$D2hi,$D2hi,$D2hi
   3517	vpmadd52huq	$RR2,$R0,$D2hi
   3518
   3519	vpmadd52luq	$RR0,$R0,$D0lo
   3520	vpmadd52huq	$RR0,$R0,$D0hi
   3521	vpmadd52luq	$RR0,$R1,$D1lo
   3522	vpmadd52huq	$RR0,$R1,$D1hi
   3523	vpmadd52luq	$RR0,$R2,$D2lo
   3524	vpmadd52huq	$RR0,$R2,$D2hi
   3525
   3526	vpmadd52luq	$RR1,$S2,$D0lo
   3527	vpmadd52huq	$RR1,$S2,$D0hi
   3528	vpmadd52luq	$RR1,$R0,$D1lo
   3529	vpmadd52huq	$RR1,$R0,$D1hi
   3530	vpmadd52luq	$RR1,$R1,$D2lo
   3531	vpmadd52huq	$RR1,$R1,$D2hi
   3532
   3533	################################################################
   3534	# partial reduction
   3535	vpsrlq		\$44,$D0lo,$tmp
   3536	vpsllq		\$8,$D0hi,$D0hi
   3537	vpandq		$mask44,$D0lo,$RR0
   3538	vpaddq		$tmp,$D0hi,$D0hi
   3539
   3540	vpaddq		$D0hi,$D1lo,$D1lo
   3541
   3542	vpsrlq		\$44,$D1lo,$tmp
   3543	vpsllq		\$8,$D1hi,$D1hi
   3544	vpandq		$mask44,$D1lo,$RR1
   3545	vpaddq		$tmp,$D1hi,$D1hi
   3546
   3547	vpaddq		$D1hi,$D2lo,$D2lo
   3548
   3549	vpsrlq		\$42,$D2lo,$tmp
   3550	vpsllq		\$10,$D2hi,$D2hi
   3551	vpandq		$mask42,$D2lo,$RR2
   3552	vpaddq		$tmp,$D2hi,$D2hi
   3553
   3554	vpaddq		$D2hi,$RR0,$RR0
   3555	vpsllq		\$2,$D2hi,$D2hi
   3556
   3557	vpaddq		$D2hi,$RR0,$RR0
   3558
   3559	vpsrlq		\$44,$RR0,$tmp		# additional step
   3560	vpandq		$mask44,$RR0,$RR0
   3561
   3562	vpaddq		$tmp,$RR1,$RR1
   3563
   3564	################################################################
   3565	# At this point Rx holds 1324 powers, RRx - 5768, and the goal
   3566	# is 15263748, which reflects how data is loaded...
   3567
   3568	vpunpcklqdq	$R2,$RR2,$T2		# 3748
   3569	vpunpckhqdq	$R2,$RR2,$R2		# 1526
   3570	vpunpcklqdq	$R0,$RR0,$T0
   3571	vpunpckhqdq	$R0,$RR0,$R0
   3572	vpunpcklqdq	$R1,$RR1,$T1
   3573	vpunpckhqdq	$R1,$RR1,$R1
   3574___
   3575######## switch to %zmm
   3576map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
   3577map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
   3578map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
   3579map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
   3580
   3581$code.=<<___;
   3582	vshufi64x2	\$0x44,$R2,$T2,$RR2	# 15263748
   3583	vshufi64x2	\$0x44,$R0,$T0,$RR0
   3584	vshufi64x2	\$0x44,$R1,$T1,$RR1
   3585
   3586	vmovdqu64	16*0($inp),$T2		# load data
   3587	vmovdqu64	16*4($inp),$T3
   3588	lea		16*8($inp),$inp
   3589
   3590	vpsllq		\$2,$RR2,$SS2		# S2 = R2*5*4
   3591	vpsllq		\$2,$RR1,$SS1		# S1 = R1*5*4
   3592	vpaddq		$RR2,$SS2,$SS2
   3593	vpaddq		$RR1,$SS1,$SS1
   3594	vpsllq		\$2,$SS2,$SS2
   3595	vpsllq		\$2,$SS1,$SS1
   3596
   3597	vpbroadcastq	$padbit,$PAD
   3598	vpbroadcastq	%x#$mask44,$mask44
   3599	vpbroadcastq	%x#$mask42,$mask42
   3600
   3601	vpbroadcastq	%x#$SS1,$S1		# broadcast 8th power
   3602	vpbroadcastq	%x#$SS2,$S2
   3603	vpbroadcastq	%x#$RR0,$R0
   3604	vpbroadcastq	%x#$RR1,$R1
   3605	vpbroadcastq	%x#$RR2,$R2
   3606
   3607	vpunpcklqdq	$T3,$T2,$T1		# transpose data
   3608	vpunpckhqdq	$T3,$T2,$T3
   3609
   3610	# at this point 64-bit lanes are ordered as 73625140
   3611
   3612	vpsrlq		\$24,$T3,$T2		# splat the data
   3613	vporq		$PAD,$T2,$T2
   3614	 vpaddq		$T2,$H2,$H2		# accumulate input
   3615	vpandq		$mask44,$T1,$T0
   3616	vpsrlq		\$44,$T1,$T1
   3617	vpsllq		\$20,$T3,$T3
   3618	vporq		$T3,$T1,$T1
   3619	vpandq		$mask44,$T1,$T1
   3620
   3621	sub		\$8,$len
   3622	jz		.Ltail_vpmadd52_8x
   3623	jmp		.Loop_vpmadd52_8x
   3624
   3625.align	32
   3626.Loop_vpmadd52_8x:
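	# Each iteration folds eight 16-byte blocks (128 bytes) into the
	# eight per-lane accumulators, multiplying by r^8 (broadcast into
	# R0/R1/R2 and S1/S2 above), with the load and splat of the next
	# 128 bytes interleaved into the multiply and reduction.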
   3627	#vpaddq		$T2,$H2,$H2		# accumulate input
   3628	vpaddq		$T0,$H0,$H0
   3629	vpaddq		$T1,$H1,$H1
   3630
   3631	vpxorq		$D0lo,$D0lo,$D0lo
   3632	vpmadd52luq	$H2,$S1,$D0lo
   3633	vpxorq		$D0hi,$D0hi,$D0hi
   3634	vpmadd52huq	$H2,$S1,$D0hi
   3635	vpxorq		$D1lo,$D1lo,$D1lo
   3636	vpmadd52luq	$H2,$S2,$D1lo
   3637	vpxorq		$D1hi,$D1hi,$D1hi
   3638	vpmadd52huq	$H2,$S2,$D1hi
   3639	vpxorq		$D2lo,$D2lo,$D2lo
   3640	vpmadd52luq	$H2,$R0,$D2lo
   3641	vpxorq		$D2hi,$D2hi,$D2hi
   3642	vpmadd52huq	$H2,$R0,$D2hi
   3643
   3644	 vmovdqu64	16*0($inp),$T2		# load data
   3645	 vmovdqu64	16*4($inp),$T3
   3646	 lea		16*8($inp),$inp
   3647	vpmadd52luq	$H0,$R0,$D0lo
   3648	vpmadd52huq	$H0,$R0,$D0hi
   3649	vpmadd52luq	$H0,$R1,$D1lo
   3650	vpmadd52huq	$H0,$R1,$D1hi
   3651	vpmadd52luq	$H0,$R2,$D2lo
   3652	vpmadd52huq	$H0,$R2,$D2hi
   3653
   3654	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
   3655	 vpunpckhqdq	$T3,$T2,$T3
   3656	vpmadd52luq	$H1,$S2,$D0lo
   3657	vpmadd52huq	$H1,$S2,$D0hi
   3658	vpmadd52luq	$H1,$R0,$D1lo
   3659	vpmadd52huq	$H1,$R0,$D1hi
   3660	vpmadd52luq	$H1,$R1,$D2lo
   3661	vpmadd52huq	$H1,$R1,$D2hi
   3662
   3663	################################################################
   3664	# partial reduction (interleaved with data splat)
   3665	vpsrlq		\$44,$D0lo,$tmp
   3666	vpsllq		\$8,$D0hi,$D0hi
   3667	vpandq		$mask44,$D0lo,$H0
   3668	vpaddq		$tmp,$D0hi,$D0hi
   3669
   3670	 vpsrlq		\$24,$T3,$T2
   3671	 vporq		$PAD,$T2,$T2
   3672	vpaddq		$D0hi,$D1lo,$D1lo
   3673
   3674	vpsrlq		\$44,$D1lo,$tmp
   3675	vpsllq		\$8,$D1hi,$D1hi
   3676	vpandq		$mask44,$D1lo,$H1
   3677	vpaddq		$tmp,$D1hi,$D1hi
   3678
   3679	 vpandq		$mask44,$T1,$T0
   3680	 vpsrlq		\$44,$T1,$T1
   3681	 vpsllq		\$20,$T3,$T3
   3682	vpaddq		$D1hi,$D2lo,$D2lo
   3683
   3684	vpsrlq		\$42,$D2lo,$tmp
   3685	vpsllq		\$10,$D2hi,$D2hi
   3686	vpandq		$mask42,$D2lo,$H2
   3687	vpaddq		$tmp,$D2hi,$D2hi
   3688
   3689	  vpaddq	$T2,$H2,$H2		# accumulate input
   3690	vpaddq		$D2hi,$H0,$H0
   3691	vpsllq		\$2,$D2hi,$D2hi
   3692
   3693	vpaddq		$D2hi,$H0,$H0
   3694	 vporq		$T3,$T1,$T1
   3695	 vpandq		$mask44,$T1,$T1
   3696
   3697	vpsrlq		\$44,$H0,$tmp		# additional step
   3698	vpandq		$mask44,$H0,$H0
   3699
   3700	vpaddq		$tmp,$H1,$H1
   3701
    3702	sub		\$8,$len		# len-=8 blocks, i.e. 128 bytes
   3703	jnz		.Loop_vpmadd52_8x
   3704
   3705.Ltail_vpmadd52_8x:
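	# The final multiply uses the per-lane powers set up in RR0/RR1/RR2
	# and SS1/SS2 (the 15263748 order), so each lane is scaled by its own
	# power of r and the eight lanes can then simply be summed.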
   3706	#vpaddq		$T2,$H2,$H2		# accumulate input
   3707	vpaddq		$T0,$H0,$H0
   3708	vpaddq		$T1,$H1,$H1
   3709
   3710	vpxorq		$D0lo,$D0lo,$D0lo
   3711	vpmadd52luq	$H2,$SS1,$D0lo
   3712	vpxorq		$D0hi,$D0hi,$D0hi
   3713	vpmadd52huq	$H2,$SS1,$D0hi
   3714	vpxorq		$D1lo,$D1lo,$D1lo
   3715	vpmadd52luq	$H2,$SS2,$D1lo
   3716	vpxorq		$D1hi,$D1hi,$D1hi
   3717	vpmadd52huq	$H2,$SS2,$D1hi
   3718	vpxorq		$D2lo,$D2lo,$D2lo
   3719	vpmadd52luq	$H2,$RR0,$D2lo
   3720	vpxorq		$D2hi,$D2hi,$D2hi
   3721	vpmadd52huq	$H2,$RR0,$D2hi
   3722
   3723	vpmadd52luq	$H0,$RR0,$D0lo
   3724	vpmadd52huq	$H0,$RR0,$D0hi
   3725	vpmadd52luq	$H0,$RR1,$D1lo
   3726	vpmadd52huq	$H0,$RR1,$D1hi
   3727	vpmadd52luq	$H0,$RR2,$D2lo
   3728	vpmadd52huq	$H0,$RR2,$D2hi
   3729
   3730	vpmadd52luq	$H1,$SS2,$D0lo
   3731	vpmadd52huq	$H1,$SS2,$D0hi
   3732	vpmadd52luq	$H1,$RR0,$D1lo
   3733	vpmadd52huq	$H1,$RR0,$D1hi
   3734	vpmadd52luq	$H1,$RR1,$D2lo
   3735	vpmadd52huq	$H1,$RR1,$D2hi
   3736
   3737	################################################################
   3738	# horizontal addition
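	# Fold the eight 64-bit lanes of every accumulator down to lane 0:
	# pairwise additions via vpsrldq/vpermq within the 512-bit registers,
	# then vextracti64x4 of the upper halves, and finally masked additions
	# ({%k1}{z} with k1=1) that keep only the bottom quadword.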
   3739
   3740	mov		\$1,%eax
   3741	kmovw		%eax,%k1
   3742	vpsrldq		\$8,$D0lo,$T0
   3743	vpsrldq		\$8,$D0hi,$H0
   3744	vpsrldq		\$8,$D1lo,$T1
   3745	vpsrldq		\$8,$D1hi,$H1
   3746	vpaddq		$T0,$D0lo,$D0lo
   3747	vpaddq		$H0,$D0hi,$D0hi
   3748	vpsrldq		\$8,$D2lo,$T2
   3749	vpsrldq		\$8,$D2hi,$H2
   3750	vpaddq		$T1,$D1lo,$D1lo
   3751	vpaddq		$H1,$D1hi,$D1hi
   3752	 vpermq		\$0x2,$D0lo,$T0
   3753	 vpermq		\$0x2,$D0hi,$H0
   3754	vpaddq		$T2,$D2lo,$D2lo
   3755	vpaddq		$H2,$D2hi,$D2hi
   3756
   3757	vpermq		\$0x2,$D1lo,$T1
   3758	vpermq		\$0x2,$D1hi,$H1
   3759	vpaddq		$T0,$D0lo,$D0lo
   3760	vpaddq		$H0,$D0hi,$D0hi
   3761	vpermq		\$0x2,$D2lo,$T2
   3762	vpermq		\$0x2,$D2hi,$H2
   3763	vpaddq		$T1,$D1lo,$D1lo
   3764	vpaddq		$H1,$D1hi,$D1hi
   3765	 vextracti64x4	\$1,$D0lo,%y#$T0
   3766	 vextracti64x4	\$1,$D0hi,%y#$H0
   3767	vpaddq		$T2,$D2lo,$D2lo
   3768	vpaddq		$H2,$D2hi,$D2hi
   3769
   3770	vextracti64x4	\$1,$D1lo,%y#$T1
   3771	vextracti64x4	\$1,$D1hi,%y#$H1
   3772	vextracti64x4	\$1,$D2lo,%y#$T2
   3773	vextracti64x4	\$1,$D2hi,%y#$H2
   3774___
   3775######## switch back to %ymm
   3776map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
   3777map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
   3778map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
   3779
   3780$code.=<<___;
   3781	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
   3782	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
   3783	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
   3784	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
   3785	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
   3786	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
   3787
   3788	################################################################
   3789	# partial reduction
   3790	vpsrlq		\$44,$D0lo,$tmp
   3791	vpsllq		\$8,$D0hi,$D0hi
   3792	vpandq		$mask44,$D0lo,$H0
   3793	vpaddq		$tmp,$D0hi,$D0hi
   3794
   3795	vpaddq		$D0hi,$D1lo,$D1lo
   3796
   3797	vpsrlq		\$44,$D1lo,$tmp
   3798	vpsllq		\$8,$D1hi,$D1hi
   3799	vpandq		$mask44,$D1lo,$H1
   3800	vpaddq		$tmp,$D1hi,$D1hi
   3801
   3802	vpaddq		$D1hi,$D2lo,$D2lo
   3803
   3804	vpsrlq		\$42,$D2lo,$tmp
   3805	vpsllq		\$10,$D2hi,$D2hi
   3806	vpandq		$mask42,$D2lo,$H2
   3807	vpaddq		$tmp,$D2hi,$D2hi
   3808
   3809	vpaddq		$D2hi,$H0,$H0
   3810	vpsllq		\$2,$D2hi,$D2hi
   3811
   3812	vpaddq		$D2hi,$H0,$H0
   3813
   3814	vpsrlq		\$44,$H0,$tmp		# additional step
   3815	vpandq		$mask44,$H0,$H0
   3816
   3817	vpaddq		$tmp,$H1,$H1
   3818
   3819	################################################################
   3820
   3821	vmovq		%x#$H0,0($ctx)
   3822	vmovq		%x#$H1,8($ctx)
   3823	vmovq		%x#$H2,16($ctx)
   3824	vzeroall
   3825
   3826.Lno_data_vpmadd52_8x:
   3827	RET
   3828.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
   3829___
   3830}
   3831$code.=<<___;
   3832.type	poly1305_emit_base2_44,\@function,3
   3833.align	32
   3834poly1305_emit_base2_44:
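	# Recombine the base 2^44 limbs into a 130-bit value in %r8:%r9:%r10,
	# perform the final reduction mod 2^130-5, add the 128-bit nonce and
	# store the 16-byte tag.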
   3835	mov	0($ctx),%r8	# load hash value
   3836	mov	8($ctx),%r9
   3837	mov	16($ctx),%r10
   3838
   3839	mov	%r9,%rax
   3840	shr	\$20,%r9
   3841	shl	\$44,%rax
   3842	mov	%r10,%rcx
   3843	shr	\$40,%r10
   3844	shl	\$24,%rcx
   3845
   3846	add	%rax,%r8
   3847	adc	%rcx,%r9
   3848	adc	\$0,%r10
   3849
   3850	mov	%r8,%rax
   3851	add	\$5,%r8		# compare to modulus
   3852	mov	%r9,%rcx
   3853	adc	\$0,%r9
   3854	adc	\$0,%r10
   3855	shr	\$2,%r10	# did 130-bit value overflow?
   3856	cmovnz	%r8,%rax
   3857	cmovnz	%r9,%rcx
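	# If h+5 carries into bit 130, the shifted %r10 is non-zero and the
	# reduced candidate in %r8:%r9 (h+5 with the carry discarded, i.e.
	# h mod 2^130-5) is selected; only the low 128 bits form the tag.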
   3858
   3859	add	0($nonce),%rax	# accumulate nonce
   3860	adc	8($nonce),%rcx
   3861	mov	%rax,0($mac)	# write result
   3862	mov	%rcx,8($mac)
   3863
   3864	RET
   3865.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
   3866___
   3867}	}	}
   3868}
   3869
   3870if (!$kernel)
   3871{	# chacha20-poly1305 helpers
   3872my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
   3873                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
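# These helpers XOR a buffer against a block of ChaCha20 key stream ("otp"),
# write the result to the output, and copy the ciphertext (the result when
# encrypting, the input when decrypting) back into the key-stream buffer,
# zero-padded to a 16-byte boundary, so it can be fed to Poly1305; the
# return value points just past the padded data.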
   3874$code.=<<___;
   3875.globl	xor128_encrypt_n_pad
   3876.type	xor128_encrypt_n_pad,\@abi-omnipotent
   3877.align	16
   3878xor128_encrypt_n_pad:
   3879	sub	$otp,$inp
   3880	sub	$otp,$out
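	# inp and out are rebased against otp, so a single advancing pointer
	# (otp) indexes all three buffers in the loops below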
   3881	mov	$len,%r10		# put len aside
   3882	shr	\$4,$len		# len / 16
   3883	jz	.Ltail_enc
   3884	nop
   3885.Loop_enc_xmm:
   3886	movdqu	($inp,$otp),%xmm0
   3887	pxor	($otp),%xmm0
   3888	movdqu	%xmm0,($out,$otp)
   3889	movdqa	%xmm0,($otp)
   3890	lea	16($otp),$otp
   3891	dec	$len
   3892	jnz	.Loop_enc_xmm
   3893
   3894	and	\$15,%r10		# len % 16
   3895	jz	.Ldone_enc
   3896
   3897.Ltail_enc:
   3898	mov	\$16,$len
   3899	sub	%r10,$len
   3900	xor	%eax,%eax
   3901.Loop_enc_byte:
   3902	mov	($inp,$otp),%al
   3903	xor	($otp),%al
   3904	mov	%al,($out,$otp)
   3905	mov	%al,($otp)
   3906	lea	1($otp),$otp
   3907	dec	%r10
   3908	jnz	.Loop_enc_byte
   3909
   3910	xor	%eax,%eax
   3911.Loop_enc_pad:
   3912	mov	%al,($otp)
   3913	lea	1($otp),$otp
   3914	dec	$len
   3915	jnz	.Loop_enc_pad
   3916
   3917.Ldone_enc:
   3918	mov	$otp,%rax
   3919	RET
   3920.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
   3921
   3922.globl	xor128_decrypt_n_pad
   3923.type	xor128_decrypt_n_pad,\@abi-omnipotent
   3924.align	16
   3925xor128_decrypt_n_pad:
   3926	sub	$otp,$inp
   3927	sub	$otp,$out
   3928	mov	$len,%r10		# put len aside
   3929	shr	\$4,$len		# len / 16
   3930	jz	.Ltail_dec
   3931	nop
   3932.Loop_dec_xmm:
   3933	movdqu	($inp,$otp),%xmm0
   3934	movdqa	($otp),%xmm1
   3935	pxor	%xmm0,%xmm1
   3936	movdqu	%xmm1,($out,$otp)
   3937	movdqa	%xmm0,($otp)
   3938	lea	16($otp),$otp
   3939	dec	$len
   3940	jnz	.Loop_dec_xmm
   3941
   3942	pxor	%xmm1,%xmm1
   3943	and	\$15,%r10		# len % 16
   3944	jz	.Ldone_dec
   3945
   3946.Ltail_dec:
   3947	mov	\$16,$len
   3948	sub	%r10,$len
   3949	xor	%eax,%eax
   3950	xor	%r11d,%r11d
   3951.Loop_dec_byte:
   3952	mov	($inp,$otp),%r11b
   3953	mov	($otp),%al
   3954	xor	%r11b,%al
   3955	mov	%al,($out,$otp)
   3956	mov	%r11b,($otp)
   3957	lea	1($otp),$otp
   3958	dec	%r10
   3959	jnz	.Loop_dec_byte
   3960
   3961	xor	%eax,%eax
   3962.Loop_dec_pad:
   3963	mov	%al,($otp)
   3964	lea	1($otp),$otp
   3965	dec	$len
   3966	jnz	.Loop_dec_pad
   3967
   3968.Ldone_dec:
   3969	mov	$otp,%rax
   3970	RET
   3971.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
   3972___
   3973}
   3974
   3975# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   3976#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
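# The handlers below check whether the faulting RIP lies inside a function
# body, restore the non-volatile GPRs (and, in avx_handler, %xmm6-%xmm15)
# saved by that function's prologue, and pass the adjusted CONTEXT to
# RtlVirtualUnwind to continue unwinding.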
   3977if ($win64) {
   3978$rec="%rcx";
   3979$frame="%rdx";
   3980$context="%r8";
   3981$disp="%r9";
   3982
   3983$code.=<<___;
   3984.extern	__imp_RtlVirtualUnwind
   3985.type	se_handler,\@abi-omnipotent
   3986.align	16
   3987se_handler:
   3988	push	%rsi
   3989	push	%rdi
   3990	push	%rbx
   3991	push	%rbp
   3992	push	%r12
   3993	push	%r13
   3994	push	%r14
   3995	push	%r15
   3996	pushfq
   3997	sub	\$64,%rsp
   3998
   3999	mov	120($context),%rax	# pull context->Rax
   4000	mov	248($context),%rbx	# pull context->Rip
   4001
   4002	mov	8($disp),%rsi		# disp->ImageBase
   4003	mov	56($disp),%r11		# disp->HandlerData
   4004
   4005	mov	0(%r11),%r10d		# HandlerData[0]
   4006	lea	(%rsi,%r10),%r10	# prologue label
   4007	cmp	%r10,%rbx		# context->Rip<.Lprologue
   4008	jb	.Lcommon_seh_tail
   4009
   4010	mov	152($context),%rax	# pull context->Rsp
   4011
   4012	mov	4(%r11),%r10d		# HandlerData[1]
   4013	lea	(%rsi,%r10),%r10	# epilogue label
   4014	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
   4015	jae	.Lcommon_seh_tail
   4016
   4017	lea	48(%rax),%rax
   4018
   4019	mov	-8(%rax),%rbx
   4020	mov	-16(%rax),%rbp
   4021	mov	-24(%rax),%r12
   4022	mov	-32(%rax),%r13
   4023	mov	-40(%rax),%r14
   4024	mov	-48(%rax),%r15
   4025	mov	%rbx,144($context)	# restore context->Rbx
   4026	mov	%rbp,160($context)	# restore context->Rbp
   4027	mov	%r12,216($context)	# restore context->R12
   4028	mov	%r13,224($context)	# restore context->R13
   4029	mov	%r14,232($context)	# restore context->R14
    4030	mov	%r15,240($context)	# restore context->R15
   4031
   4032	jmp	.Lcommon_seh_tail
   4033.size	se_handler,.-se_handler
   4034
   4035.type	avx_handler,\@abi-omnipotent
   4036.align	16
   4037avx_handler:
   4038	push	%rsi
   4039	push	%rdi
   4040	push	%rbx
   4041	push	%rbp
   4042	push	%r12
   4043	push	%r13
   4044	push	%r14
   4045	push	%r15
   4046	pushfq
   4047	sub	\$64,%rsp
   4048
   4049	mov	120($context),%rax	# pull context->Rax
   4050	mov	248($context),%rbx	# pull context->Rip
   4051
   4052	mov	8($disp),%rsi		# disp->ImageBase
   4053	mov	56($disp),%r11		# disp->HandlerData
   4054
   4055	mov	0(%r11),%r10d		# HandlerData[0]
   4056	lea	(%rsi,%r10),%r10	# prologue label
   4057	cmp	%r10,%rbx		# context->Rip<prologue label
   4058	jb	.Lcommon_seh_tail
   4059
   4060	mov	152($context),%rax	# pull context->Rsp
   4061
   4062	mov	4(%r11),%r10d		# HandlerData[1]
   4063	lea	(%rsi,%r10),%r10	# epilogue label
   4064	cmp	%r10,%rbx		# context->Rip>=epilogue label
   4065	jae	.Lcommon_seh_tail
   4066
   4067	mov	208($context),%rax	# pull context->R11
   4068
   4069	lea	0x50(%rax),%rsi
   4070	lea	0xf8(%rax),%rax
   4071	lea	512($context),%rdi	# &context.Xmm6
   4072	mov	\$20,%ecx
   4073	.long	0xa548f3fc		# cld; rep movsq
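	# the 20 quadwords copied above are the ten saved %xmm6-%xmm15
	# registers, moved from the stack frame into the CONTEXT record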
   4074
   4075.Lcommon_seh_tail:
   4076	mov	8(%rax),%rdi
   4077	mov	16(%rax),%rsi
   4078	mov	%rax,152($context)	# restore context->Rsp
   4079	mov	%rsi,168($context)	# restore context->Rsi
   4080	mov	%rdi,176($context)	# restore context->Rdi
   4081
   4082	mov	40($disp),%rdi		# disp->ContextRecord
   4083	mov	$context,%rsi		# context
    4084	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords (154*8 bytes)
   4085	.long	0xa548f3fc		# cld; rep movsq
   4086
   4087	mov	$disp,%rsi
   4088	xor	%ecx,%ecx		# arg1, UNW_FLAG_NHANDLER
   4089	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   4090	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   4091	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   4092	mov	40(%rsi),%r10		# disp->ContextRecord
   4093	lea	56(%rsi),%r11		# &disp->HandlerData
   4094	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   4095	mov	%r10,32(%rsp)		# arg5
   4096	mov	%r11,40(%rsp)		# arg6
   4097	mov	%r12,48(%rsp)		# arg7
   4098	mov	%rcx,56(%rsp)		# arg8, (NULL)
   4099	call	*__imp_RtlVirtualUnwind(%rip)
   4100
   4101	mov	\$1,%eax		# ExceptionContinueSearch
   4102	add	\$64,%rsp
   4103	popfq
   4104	pop	%r15
   4105	pop	%r14
   4106	pop	%r13
   4107	pop	%r12
   4108	pop	%rbp
   4109	pop	%rbx
   4110	pop	%rdi
   4111	pop	%rsi
   4112	RET
   4113.size	avx_handler,.-avx_handler
   4114
   4115.section	.pdata
   4116.align	4
   4117	.rva	.LSEH_begin_poly1305_init_x86_64
   4118	.rva	.LSEH_end_poly1305_init_x86_64
   4119	.rva	.LSEH_info_poly1305_init_x86_64
   4120
   4121	.rva	.LSEH_begin_poly1305_blocks_x86_64
   4122	.rva	.LSEH_end_poly1305_blocks_x86_64
   4123	.rva	.LSEH_info_poly1305_blocks_x86_64
   4124
   4125	.rva	.LSEH_begin_poly1305_emit_x86_64
   4126	.rva	.LSEH_end_poly1305_emit_x86_64
   4127	.rva	.LSEH_info_poly1305_emit_x86_64
   4128___
   4129$code.=<<___ if ($avx);
   4130	.rva	.LSEH_begin_poly1305_blocks_avx
   4131	.rva	.Lbase2_64_avx
   4132	.rva	.LSEH_info_poly1305_blocks_avx_1
   4133
   4134	.rva	.Lbase2_64_avx
   4135	.rva	.Leven_avx
   4136	.rva	.LSEH_info_poly1305_blocks_avx_2
   4137
   4138	.rva	.Leven_avx
   4139	.rva	.LSEH_end_poly1305_blocks_avx
   4140	.rva	.LSEH_info_poly1305_blocks_avx_3
   4141
   4142	.rva	.LSEH_begin_poly1305_emit_avx
   4143	.rva	.LSEH_end_poly1305_emit_avx
   4144	.rva	.LSEH_info_poly1305_emit_avx
   4145___
   4146$code.=<<___ if ($avx>1);
   4147	.rva	.LSEH_begin_poly1305_blocks_avx2
   4148	.rva	.Lbase2_64_avx2
   4149	.rva	.LSEH_info_poly1305_blocks_avx2_1
   4150
   4151	.rva	.Lbase2_64_avx2
   4152	.rva	.Leven_avx2
   4153	.rva	.LSEH_info_poly1305_blocks_avx2_2
   4154
   4155	.rva	.Leven_avx2
   4156	.rva	.LSEH_end_poly1305_blocks_avx2
   4157	.rva	.LSEH_info_poly1305_blocks_avx2_3
   4158___
   4159$code.=<<___ if ($avx>2);
   4160	.rva	.LSEH_begin_poly1305_blocks_avx512
   4161	.rva	.LSEH_end_poly1305_blocks_avx512
   4162	.rva	.LSEH_info_poly1305_blocks_avx512
   4163___
   4164$code.=<<___;
   4165.section	.xdata
   4166.align	8
   4167.LSEH_info_poly1305_init_x86_64:
   4168	.byte	9,0,0,0
   4169	.rva	se_handler
   4170	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
   4171
   4172.LSEH_info_poly1305_blocks_x86_64:
   4173	.byte	9,0,0,0
   4174	.rva	se_handler
   4175	.rva	.Lblocks_body,.Lblocks_epilogue
   4176
   4177.LSEH_info_poly1305_emit_x86_64:
   4178	.byte	9,0,0,0
   4179	.rva	se_handler
   4180	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
   4181___
   4182$code.=<<___ if ($avx);
   4183.LSEH_info_poly1305_blocks_avx_1:
   4184	.byte	9,0,0,0
   4185	.rva	se_handler
   4186	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
   4187
   4188.LSEH_info_poly1305_blocks_avx_2:
   4189	.byte	9,0,0,0
   4190	.rva	se_handler
   4191	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
   4192
   4193.LSEH_info_poly1305_blocks_avx_3:
   4194	.byte	9,0,0,0
   4195	.rva	avx_handler
   4196	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
   4197
   4198.LSEH_info_poly1305_emit_avx:
   4199	.byte	9,0,0,0
   4200	.rva	se_handler
   4201	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
   4202___
   4203$code.=<<___ if ($avx>1);
   4204.LSEH_info_poly1305_blocks_avx2_1:
   4205	.byte	9,0,0,0
   4206	.rva	se_handler
   4207	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
   4208
   4209.LSEH_info_poly1305_blocks_avx2_2:
   4210	.byte	9,0,0,0
   4211	.rva	se_handler
   4212	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
   4213
   4214.LSEH_info_poly1305_blocks_avx2_3:
   4215	.byte	9,0,0,0
   4216	.rva	avx_handler
   4217	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
   4218___
   4219$code.=<<___ if ($avx>2);
   4220.LSEH_info_poly1305_blocks_avx512:
   4221	.byte	9,0,0,0
   4222	.rva	avx_handler
   4223	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
   4224___
   4225}
   4226
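# Reproduce this script's leading comment block (license and notes) at the
# top of the generated file, converting "#" comments to "//" comments.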
   4227open SELF,$0;
   4228while(<SELF>) {
   4229	next if (/^#!/);
   4230	last if (!s/^#/\/\// and !/^$/);
   4231	print;
   4232}
   4233close SELF;
   4234
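# Post-process the generated code: evaluate `...` expressions, translate the
# #d and %x#%y register-width shorthands used above into real register names,
# and, for kernel builds, drop .cfi directives and fix up .type annotations.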
   4235foreach (split('\n',$code)) {
   4236	s/\`([^\`]*)\`/eval($1)/ge;
   4237	s/%r([a-z]+)#d/%e$1/g;
   4238	s/%r([0-9]+)#d/%r$1d/g;
   4239	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
   4240
   4241	if ($kernel) {
   4242		s/(^\.type.*),[0-9]+$/\1/;
   4243		s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
   4244		next if /^\.cfi.*/;
   4245	}
   4246
   4247	print $_,"\n";
   4248}
   4249close STDOUT;