cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

poly1305-armv4.pl (29431B)


      1#!/usr/bin/env perl
      2# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
      3#
      4# ====================================================================
      5# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
      6# project.
      7# ====================================================================
      8#
      9#			IALU(*)/gcc-4.4		NEON
     10#
     11# ARM11xx(ARMv6)	7.78/+100%		-
     12# Cortex-A5		6.35/+130%		3.00
     13# Cortex-A8		6.25/+115%		2.36
     14# Cortex-A9		5.10/+95%		2.55
     15# Cortex-A15		3.85/+85%		1.25(**)
     16# Snapdragon S4		5.70/+100%		1.48(**)
     17#
      18# (*)	this is for -march=armv6, i.e. with a bunch of ldrb instructions loading data;
      19# (**)	these are trade-off results; they can be improved by ~8% but at
      20#	the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
      21#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
     22
     23$flavour = shift;
     24if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
     25else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
     26
     27if ($flavour && $flavour ne "void") {
     28    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     29    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
     30    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
     31    die "can't locate arm-xlate.pl";
     32
     33    open STDOUT,"| \"$^X\" $xlate $flavour $output";
     34} else {
     35    open STDOUT,">$output";
     36}
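# The first command-line argument may name a perlasm "flavour" (e.g. linux32);
# in that case the generated source is piped through arm-xlate.pl, which adapts
# it to the target assembler's syntax.  Otherwise the code is written verbatim
# to the requested output file.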
     37
     38($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
     39
     40$code.=<<___;
     41#ifndef	__KERNEL__
     42# include "arm_arch.h"
     43#else
     44# define __ARM_ARCH__ __LINUX_ARM_ARCH__
     45# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
     46# define poly1305_init   poly1305_init_arm
     47# define poly1305_blocks poly1305_blocks_arm
     48# define poly1305_emit   poly1305_emit_arm
     49.globl	poly1305_blocks_neon
     50#endif
     51
     52#if defined(__thumb2__)
     53.syntax	unified
     54.thumb
     55#else
     56.code	32
     57#endif
     58
     59.text
     60
     61.globl	poly1305_emit
     62.globl	poly1305_blocks
     63.globl	poly1305_init
     64.type	poly1305_init,%function
     65.align	5
     66poly1305_init:
     67.Lpoly1305_init:
     68	stmdb	sp!,{r4-r11}
     69
     70	eor	r3,r3,r3
     71	cmp	$inp,#0
     72	str	r3,[$ctx,#0]		@ zero hash value
     73	str	r3,[$ctx,#4]
     74	str	r3,[$ctx,#8]
     75	str	r3,[$ctx,#12]
     76	str	r3,[$ctx,#16]
     77	str	r3,[$ctx,#36]		@ clear is_base2_26
     78	add	$ctx,$ctx,#20
     79
     80#ifdef	__thumb2__
     81	it	eq
     82#endif
     83	moveq	r0,#0
     84	beq	.Lno_key
     85
     86#if	__ARM_MAX_ARCH__>=7
     87	mov	r3,#-1
     88	str	r3,[$ctx,#28]		@ impossible key power value
     89# ifndef __KERNEL__
     90	adr	r11,.Lpoly1305_init
     91	ldr	r12,.LOPENSSL_armcap
     92# endif
     93#endif
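	@ Load the 16-byte key r and clamp it as Poly1305 requires:
	@ the first 32-bit word is masked with 0x0fffffff and the other
	@ three with 0x0ffffffc, clearing the top four bits of bytes
	@ 3, 7, 11, 15 and the low two bits of bytes 4, 8, 12.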
     94	ldrb	r4,[$inp,#0]
     95	mov	r10,#0x0fffffff
     96	ldrb	r5,[$inp,#1]
     97	and	r3,r10,#-4		@ 0x0ffffffc
     98	ldrb	r6,[$inp,#2]
     99	ldrb	r7,[$inp,#3]
    100	orr	r4,r4,r5,lsl#8
    101	ldrb	r5,[$inp,#4]
    102	orr	r4,r4,r6,lsl#16
    103	ldrb	r6,[$inp,#5]
    104	orr	r4,r4,r7,lsl#24
    105	ldrb	r7,[$inp,#6]
    106	and	r4,r4,r10
    107
    108#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
    109# if !defined(_WIN32)
    110	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
    111# endif
    112# if defined(__APPLE__) || defined(_WIN32)
    113	ldr	r12,[r12]
    114# endif
    115#endif
    116	ldrb	r8,[$inp,#7]
    117	orr	r5,r5,r6,lsl#8
    118	ldrb	r6,[$inp,#8]
    119	orr	r5,r5,r7,lsl#16
    120	ldrb	r7,[$inp,#9]
    121	orr	r5,r5,r8,lsl#24
    122	ldrb	r8,[$inp,#10]
    123	and	r5,r5,r3
    124
    125#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
    126	tst	r12,#ARMV7_NEON		@ check for NEON
    127# ifdef	__thumb2__
    128	adr	r9,.Lpoly1305_blocks_neon
    129	adr	r11,.Lpoly1305_blocks
    130	it	ne
    131	movne	r11,r9
    132	adr	r12,.Lpoly1305_emit
    133	orr	r11,r11,#1		@ thumb-ify addresses
    134	orr	r12,r12,#1
    135# else
    136	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
    137	ite	eq
    138	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
    139	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
    140# endif
    141#endif
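	@ With runtime NEON detection (non-kernel builds) r11/r12 now hold
	@ the addresses of the preferred blocks/emit routines; they are
	@ stored into the function table passed in r2 further down, and
	@ the routine then returns 1 to signal that the table was filled.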
    142	ldrb	r9,[$inp,#11]
    143	orr	r6,r6,r7,lsl#8
    144	ldrb	r7,[$inp,#12]
    145	orr	r6,r6,r8,lsl#16
    146	ldrb	r8,[$inp,#13]
    147	orr	r6,r6,r9,lsl#24
    148	ldrb	r9,[$inp,#14]
    149	and	r6,r6,r3
    150
    151	ldrb	r10,[$inp,#15]
    152	orr	r7,r7,r8,lsl#8
    153	str	r4,[$ctx,#0]
    154	orr	r7,r7,r9,lsl#16
    155	str	r5,[$ctx,#4]
    156	orr	r7,r7,r10,lsl#24
    157	str	r6,[$ctx,#8]
    158	and	r7,r7,r3
    159	str	r7,[$ctx,#12]
    160#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
    161	stmia	r2,{r11,r12}		@ fill functions table
    162	mov	r0,#1
    163#else
    164	mov	r0,#0
    165#endif
    166.Lno_key:
    167	ldmia	sp!,{r4-r11}
    168#if	__ARM_ARCH__>=5
    169	ret				@ bx	lr
    170#else
    171	tst	lr,#1
    172	moveq	pc,lr			@ be binary compatible with V4, yet
    173	bx	lr			@ interoperable with Thumb ISA:-)
    174#endif
    175.size	poly1305_init,.-poly1305_init
    176___
    177{
    178my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
    179my ($s1,$s2,$s3)=($r1,$r2,$r3);
    180
    181$code.=<<___;
    182.type	poly1305_blocks,%function
    183.align	5
    184poly1305_blocks:
    185.Lpoly1305_blocks:
    186	stmdb	sp!,{r3-r11,lr}
    187
    188	ands	$len,$len,#-16
    189	beq	.Lno_data
    190
    191	add	$len,$len,$inp		@ end pointer
    192	sub	sp,sp,#32
    193
    194#if __ARM_ARCH__<7
    195	ldmia	$ctx,{$h0-$r3}		@ load context
    196	add	$ctx,$ctx,#20
    197	str	$len,[sp,#16]		@ offload stuff
    198	str	$ctx,[sp,#12]
    199#else
    200	ldr	lr,[$ctx,#36]		@ is_base2_26
    201	ldmia	$ctx!,{$h0-$h4}		@ load hash value
    202	str	$len,[sp,#16]		@ offload stuff
    203	str	$ctx,[sp,#12]
    204
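	@ The scalar code keeps the hash in base 2^32 (four words plus a
	@ small h4), while the NEON code uses base 2^26 (five 26-bit limbs)
	@ so that vmlal.u32 products fit in 64 bits.  is_base2_26 records
	@ which form is currently stored; if the NEON path ran last, convert
	@ back to base 2^32 here.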
    205	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
    206	mov	$r1,$h1,lsr#6
    207	adcs	$r1,$r1,$h2,lsl#20
    208	mov	$r2,$h2,lsr#12
    209	adcs	$r2,$r2,$h3,lsl#14
    210	mov	$r3,$h3,lsr#18
    211	adcs	$r3,$r3,$h4,lsl#8
    212	mov	$len,#0
    213	teq	lr,#0
    214	str	$len,[$ctx,#16]		@ clear is_base2_26
    215	adc	$len,$len,$h4,lsr#24
    216
    217	itttt	ne
    218	movne	$h0,$r0			@ choose between radixes
    219	movne	$h1,$r1
    220	movne	$h2,$r2
    221	movne	$h3,$r3
    222	ldmia	$ctx,{$r0-$r3}		@ load key
    223	it	ne
    224	movne	$h4,$len
    225#endif
    226
    227	mov	lr,$inp
    228	cmp	$padbit,#0
    229	str	$r1,[sp,#20]
    230	str	$r2,[sp,#24]
    231	str	$r3,[sp,#28]
    232	b	.Loop
    233
    234.align	4
    235.Loop:
    236#if __ARM_ARCH__<7
    237	ldrb	r0,[lr],#16		@ load input
    238# ifdef	__thumb2__
    239	it	hi
    240# endif
    241	addhi	$h4,$h4,#1		@ 1<<128
    242	ldrb	r1,[lr,#-15]
    243	ldrb	r2,[lr,#-14]
    244	ldrb	r3,[lr,#-13]
    245	orr	r1,r0,r1,lsl#8
    246	ldrb	r0,[lr,#-12]
    247	orr	r2,r1,r2,lsl#16
    248	ldrb	r1,[lr,#-11]
    249	orr	r3,r2,r3,lsl#24
    250	ldrb	r2,[lr,#-10]
    251	adds	$h0,$h0,r3		@ accumulate input
    252
    253	ldrb	r3,[lr,#-9]
    254	orr	r1,r0,r1,lsl#8
    255	ldrb	r0,[lr,#-8]
    256	orr	r2,r1,r2,lsl#16
    257	ldrb	r1,[lr,#-7]
    258	orr	r3,r2,r3,lsl#24
    259	ldrb	r2,[lr,#-6]
    260	adcs	$h1,$h1,r3
    261
    262	ldrb	r3,[lr,#-5]
    263	orr	r1,r0,r1,lsl#8
    264	ldrb	r0,[lr,#-4]
    265	orr	r2,r1,r2,lsl#16
    266	ldrb	r1,[lr,#-3]
    267	orr	r3,r2,r3,lsl#24
    268	ldrb	r2,[lr,#-2]
    269	adcs	$h2,$h2,r3
    270
    271	ldrb	r3,[lr,#-1]
    272	orr	r1,r0,r1,lsl#8
    273	str	lr,[sp,#8]		@ offload input pointer
    274	orr	r2,r1,r2,lsl#16
    275	add	$s1,$r1,$r1,lsr#2
    276	orr	r3,r2,r3,lsl#24
    277#else
    278	ldr	r0,[lr],#16		@ load input
    279	it	hi
    280	addhi	$h4,$h4,#1		@ padbit
    281	ldr	r1,[lr,#-12]
    282	ldr	r2,[lr,#-8]
    283	ldr	r3,[lr,#-4]
    284# ifdef	__ARMEB__
    285	rev	r0,r0
    286	rev	r1,r1
    287	rev	r2,r2
    288	rev	r3,r3
    289# endif
    290	adds	$h0,$h0,r0		@ accumulate input
    291	str	lr,[sp,#8]		@ offload input pointer
    292	adcs	$h1,$h1,r1
    293	add	$s1,$r1,$r1,lsr#2
    294	adcs	$h2,$h2,r2
    295#endif
    296	add	$s2,$r2,$r2,lsr#2
    297	adcs	$h3,$h3,r3
    298	add	$s3,$r3,$r3,lsr#2
    299
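	@ s1/s2/s3 = r1/r2/r3 + (r1/r2/r3>>2), i.e. 5*r/4 (exact, since
	@ clamping cleared the two low bits of r1-r3).  Multiplying by
	@ these instead of r1-r3 folds the 2^130 = 5 (mod 2^130-5)
	@ reduction straight into the 32x32->64 multiply-accumulate chain.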
    300	umull	r2,r3,$h1,$r0
    301	 adc	$h4,$h4,#0
    302	umull	r0,r1,$h0,$r0
    303	umlal	r2,r3,$h4,$s1
    304	umlal	r0,r1,$h3,$s1
    305	ldr	$r1,[sp,#20]		@ reload $r1
    306	umlal	r2,r3,$h2,$s3
    307	umlal	r0,r1,$h1,$s3
    308	umlal	r2,r3,$h3,$s2
    309	umlal	r0,r1,$h2,$s2
    310	umlal	r2,r3,$h0,$r1
    311	str	r0,[sp,#0]		@ future $h0
    312	 mul	r0,$s2,$h4
    313	ldr	$r2,[sp,#24]		@ reload $r2
    314	adds	r2,r2,r1		@ d1+=d0>>32
    315	 eor	r1,r1,r1
    316	adc	lr,r3,#0		@ future $h2
    317	str	r2,[sp,#4]		@ future $h1
    318
    319	mul	r2,$s3,$h4
    320	eor	r3,r3,r3
    321	umlal	r0,r1,$h3,$s3
    322	ldr	$r3,[sp,#28]		@ reload $r3
    323	umlal	r2,r3,$h3,$r0
    324	umlal	r0,r1,$h2,$r0
    325	umlal	r2,r3,$h2,$r1
    326	umlal	r0,r1,$h1,$r1
    327	umlal	r2,r3,$h1,$r2
    328	umlal	r0,r1,$h0,$r2
    329	umlal	r2,r3,$h0,$r3
    330	ldr	$h0,[sp,#0]
    331	mul	$h4,$r0,$h4
    332	ldr	$h1,[sp,#4]
    333
    334	adds	$h2,lr,r0		@ d2+=d1>>32
    335	ldr	lr,[sp,#8]		@ reload input pointer
    336	adc	r1,r1,#0
    337	adds	$h3,r2,r1		@ d3+=d2>>32
    338	ldr	r0,[sp,#16]		@ reload end pointer
    339	adc	r3,r3,#0
    340	add	$h4,$h4,r3		@ h4+=d3>>32
    341
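	@ Partial reduction: the bits of h4 above bit 1 count multiples of
	@ 2^130, and 2^130 = 5 (mod 2^130-5), so add 5*(h4>>2) back into h0
	@ (the *5 is computed as x + x>>2 on the 4-aligned value) and keep
	@ only h4 & 3.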
    342	and	r1,$h4,#-4
    343	and	$h4,$h4,#3
    344	add	r1,r1,r1,lsr#2		@ *=5
    345	adds	$h0,$h0,r1
    346	adcs	$h1,$h1,#0
    347	adcs	$h2,$h2,#0
    348	adcs	$h3,$h3,#0
    349	adc	$h4,$h4,#0
    350
    351	cmp	r0,lr			@ done yet?
    352	bhi	.Loop
    353
    354	ldr	$ctx,[sp,#12]
    355	add	sp,sp,#32
    356	stmdb	$ctx,{$h0-$h4}		@ store the result
    357
    358.Lno_data:
    359#if	__ARM_ARCH__>=5
    360	ldmia	sp!,{r3-r11,pc}
    361#else
    362	ldmia	sp!,{r3-r11,lr}
    363	tst	lr,#1
    364	moveq	pc,lr			@ be binary compatible with V4, yet
    365	bx	lr			@ interoperable with Thumb ISA:-)
    366#endif
    367.size	poly1305_blocks,.-poly1305_blocks
    368___
    369}
    370{
    371my ($ctx,$mac,$nonce)=map("r$_",(0..2));
    372my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
    373my $g4=$ctx;
    374
    375$code.=<<___;
    376.type	poly1305_emit,%function
    377.align	5
    378poly1305_emit:
    379.Lpoly1305_emit:
    380	stmdb	sp!,{r4-r11}
    381
    382	ldmia	$ctx,{$h0-$h4}
    383
    384#if __ARM_ARCH__>=7
    385	ldr	ip,[$ctx,#36]		@ is_base2_26
    386
    387	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
    388	mov	$g1,$h1,lsr#6
    389	adcs	$g1,$g1,$h2,lsl#20
    390	mov	$g2,$h2,lsr#12
    391	adcs	$g2,$g2,$h3,lsl#14
    392	mov	$g3,$h3,lsr#18
    393	adcs	$g3,$g3,$h4,lsl#8
    394	mov	$g4,#0
    395	adc	$g4,$g4,$h4,lsr#24
    396
    397	tst	ip,ip
    398	itttt	ne
    399	movne	$h0,$g0
    400	movne	$h1,$g1
    401	movne	$h2,$g2
    402	movne	$h3,$g3
    403	it	ne
    404	movne	$h4,$g4
    405#endif
    406
    407	adds	$g0,$h0,#5		@ compare to modulus
    408	adcs	$g1,$h1,#0
    409	adcs	$g2,$h2,#0
    410	adcs	$g3,$h3,#0
    411	adc	$g4,$h4,#0
    412	tst	$g4,#4			@ did it carry/borrow?
    413
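	@ Final reduction: h + 5 sets bit 130 (bit 2 of g4) exactly when
	@ h >= 2^130 - 5; in that case the reduced value is h - (2^130-5),
	@ i.e. the low bits of h + 5, selected by the conditional moves
	@ below.  The 128-bit nonce is then added modulo 2^128.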
    414#ifdef	__thumb2__
    415	it	ne
    416#endif
    417	movne	$h0,$g0
    418	ldr	$g0,[$nonce,#0]
    419#ifdef	__thumb2__
    420	it	ne
    421#endif
    422	movne	$h1,$g1
    423	ldr	$g1,[$nonce,#4]
    424#ifdef	__thumb2__
    425	it	ne
    426#endif
    427	movne	$h2,$g2
    428	ldr	$g2,[$nonce,#8]
    429#ifdef	__thumb2__
    430	it	ne
    431#endif
    432	movne	$h3,$g3
    433	ldr	$g3,[$nonce,#12]
    434
    435	adds	$h0,$h0,$g0
    436	adcs	$h1,$h1,$g1
    437	adcs	$h2,$h2,$g2
    438	adc	$h3,$h3,$g3
    439
    440#if __ARM_ARCH__>=7
    441# ifdef __ARMEB__
    442	rev	$h0,$h0
    443	rev	$h1,$h1
    444	rev	$h2,$h2
    445	rev	$h3,$h3
    446# endif
    447	str	$h0,[$mac,#0]
    448	str	$h1,[$mac,#4]
    449	str	$h2,[$mac,#8]
    450	str	$h3,[$mac,#12]
    451#else
    452	strb	$h0,[$mac,#0]
    453	mov	$h0,$h0,lsr#8
    454	strb	$h1,[$mac,#4]
    455	mov	$h1,$h1,lsr#8
    456	strb	$h2,[$mac,#8]
    457	mov	$h2,$h2,lsr#8
    458	strb	$h3,[$mac,#12]
    459	mov	$h3,$h3,lsr#8
    460
    461	strb	$h0,[$mac,#1]
    462	mov	$h0,$h0,lsr#8
    463	strb	$h1,[$mac,#5]
    464	mov	$h1,$h1,lsr#8
    465	strb	$h2,[$mac,#9]
    466	mov	$h2,$h2,lsr#8
    467	strb	$h3,[$mac,#13]
    468	mov	$h3,$h3,lsr#8
    469
    470	strb	$h0,[$mac,#2]
    471	mov	$h0,$h0,lsr#8
    472	strb	$h1,[$mac,#6]
    473	mov	$h1,$h1,lsr#8
    474	strb	$h2,[$mac,#10]
    475	mov	$h2,$h2,lsr#8
    476	strb	$h3,[$mac,#14]
    477	mov	$h3,$h3,lsr#8
    478
    479	strb	$h0,[$mac,#3]
    480	strb	$h1,[$mac,#7]
    481	strb	$h2,[$mac,#11]
    482	strb	$h3,[$mac,#15]
    483#endif
    484	ldmia	sp!,{r4-r11}
    485#if	__ARM_ARCH__>=5
    486	ret				@ bx	lr
    487#else
    488	tst	lr,#1
    489	moveq	pc,lr			@ be binary compatible with V4, yet
    490	bx	lr			@ interoperable with Thumb ISA:-)
    491#endif
    492.size	poly1305_emit,.-poly1305_emit
    493___
    494{
    495my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
    496my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
    497my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
    498
    499my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
    500
    501$code.=<<___;
    502#if	__ARM_MAX_ARCH__>=7
    503.fpu	neon
    504
    505.type	poly1305_init_neon,%function
    506.align	5
    507poly1305_init_neon:
    508.Lpoly1305_init_neon:
    509	ldr	r3,[$ctx,#48]		@ first table element
    510	cmp	r3,#-1			@ is value impossible?
    511	bne	.Lno_init_neon
    512
    513	ldr	r4,[$ctx,#20]		@ load key base 2^32
    514	ldr	r5,[$ctx,#24]
    515	ldr	r6,[$ctx,#28]
    516	ldr	r7,[$ctx,#32]
    517
    518	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
    519	mov	r3,r4,lsr#26
    520	mov	r4,r5,lsr#20
    521	orr	r3,r3,r5,lsl#6
    522	mov	r5,r6,lsr#14
    523	orr	r4,r4,r6,lsl#12
    524	mov	r6,r7,lsr#8
    525	orr	r5,r5,r7,lsl#18
    526	and	r3,r3,#0x03ffffff
    527	and	r4,r4,#0x03ffffff
    528	and	r5,r5,#0x03ffffff
    529
    530	vdup.32	$R0,r2			@ r^1 in both lanes
    531	add	r2,r3,r3,lsl#2		@ *5
    532	vdup.32	$R1,r3
    533	add	r3,r4,r4,lsl#2
    534	vdup.32	$S1,r2
    535	vdup.32	$R2,r4
    536	add	r4,r5,r5,lsl#2
    537	vdup.32	$S2,r3
    538	vdup.32	$R3,r5
    539	add	r5,r6,r6,lsl#2
    540	vdup.32	$S3,r4
    541	vdup.32	$R4,r6
    542	vdup.32	$S4,r5
    543
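	@ The loop below computes r^2, r^3 and r^4 by repeated squaring and
	@ stores all four powers (interleaved with their 5x multiples) in
	@ the table at offset 48 of the context, so the main NEON loop can
	@ keep different powers of r in the two lanes of each vector.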
    544	mov	$zeros,#2		@ counter
    545
    546.Lsquare_neon:
    547	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    548	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
    549	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
    550	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
    551	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
    552	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
    553
    554	vmull.u32	$D0,$R0,${R0}[1]
    555	vmull.u32	$D1,$R1,${R0}[1]
    556	vmull.u32	$D2,$R2,${R0}[1]
    557	vmull.u32	$D3,$R3,${R0}[1]
    558	vmull.u32	$D4,$R4,${R0}[1]
    559
    560	vmlal.u32	$D0,$R4,${S1}[1]
    561	vmlal.u32	$D1,$R0,${R1}[1]
    562	vmlal.u32	$D2,$R1,${R1}[1]
    563	vmlal.u32	$D3,$R2,${R1}[1]
    564	vmlal.u32	$D4,$R3,${R1}[1]
    565
    566	vmlal.u32	$D0,$R3,${S2}[1]
    567	vmlal.u32	$D1,$R4,${S2}[1]
    568	vmlal.u32	$D3,$R1,${R2}[1]
    569	vmlal.u32	$D2,$R0,${R2}[1]
    570	vmlal.u32	$D4,$R2,${R2}[1]
    571
    572	vmlal.u32	$D0,$R2,${S3}[1]
    573	vmlal.u32	$D3,$R0,${R3}[1]
    574	vmlal.u32	$D1,$R3,${S3}[1]
    575	vmlal.u32	$D2,$R4,${S3}[1]
    576	vmlal.u32	$D4,$R1,${R3}[1]
    577
    578	vmlal.u32	$D3,$R4,${S4}[1]
    579	vmlal.u32	$D0,$R1,${S4}[1]
    580	vmlal.u32	$D1,$R2,${S4}[1]
    581	vmlal.u32	$D2,$R3,${S4}[1]
    582	vmlal.u32	$D4,$R0,${R4}[1]
    583
    584	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    585	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
    586	@ and P. Schwabe
    587	@
    588	@ H0>>+H1>>+H2>>+H3>>+H4
    589	@ H3>>+H4>>*5+H0>>+H1
    590	@
    591	@ Trivia.
    592	@
     593	@ The result of multiplying an n-bit number by an m-bit number is
     594	@ n+m bits wide. However! Even though 2^n is an n+1-bit number, an
     595	@ m-bit number multiplied by 2^n is still n+m bits wide.
    596	@
    597	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
    598	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
    599	@ one is n+1 bits wide.
    600	@
    601	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
    602	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
    603	@ can be 27. However! In cases when their width exceeds 26 bits
    604	@ they are limited by 2^26+2^6. This in turn means that *sum*
    605	@ of the products with these values can still be viewed as sum
     606	@ of 52-bit numbers as long as the number of addends is not a
    607	@ power of 2. For example,
    608	@
    609	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
    610	@
    611	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
    612	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
     613	@ 8 * (2^52) or 2^55. However, the value is then multiplied
     614	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
    615	@ which is less than 32 * (2^52) or 2^57. And when processing
    616	@ data we are looking at triple as many addends...
    617	@
    618	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
    619	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
    620	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
     621	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The vmlal.u32
     622	@ instruction accepts 2x32-bit inputs and writes a 2x64-bit result.
     623	@ This means that the result of reduction has to be compressed upon
     624	@ loop wrap-around. This can be done in the process of reduction
     625	@ to minimize the number of instructions [as well as the number of
    626	@ 128-bit instructions, which benefits low-end processors], but
    627	@ one has to watch for H2 (which is narrower than H0) and 5*H4
    628	@ not being wider than 58 bits, so that result of right shift
    629	@ by 26 bits fits in 32 bits. This is also useful on x86,
     630	@ because it allows using paddd in place of paddq, which
    631	@ benefits Atom, where paddq is ridiculously slow.
    632
    633	vshr.u64	$T0,$D3,#26
    634	vmovn.i64	$D3#lo,$D3
    635	 vshr.u64	$T1,$D0,#26
    636	 vmovn.i64	$D0#lo,$D0
    637	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
    638	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
    639	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
    640	 vbic.i32	$D0#lo,#0xfc000000
    641
    642	vshrn.u64	$T0#lo,$D4,#26
    643	vmovn.i64	$D4#lo,$D4
    644	 vshr.u64	$T1,$D1,#26
    645	 vmovn.i64	$D1#lo,$D1
    646	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
    647	vbic.i32	$D4#lo,#0xfc000000
    648	 vbic.i32	$D1#lo,#0xfc000000
    649
    650	vadd.i32	$D0#lo,$D0#lo,$T0#lo
    651	vshl.u32	$T0#lo,$T0#lo,#2
    652	 vshrn.u64	$T1#lo,$D2,#26
    653	 vmovn.i64	$D2#lo,$D2
    654	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
    655	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
    656	 vbic.i32	$D2#lo,#0xfc000000
    657
    658	vshr.u32	$T0#lo,$D0#lo,#26
    659	vbic.i32	$D0#lo,#0xfc000000
    660	 vshr.u32	$T1#lo,$D3#lo,#26
    661	 vbic.i32	$D3#lo,#0xfc000000
    662	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
    663	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
    664
    665	subs		$zeros,$zeros,#1
    666	beq		.Lsquare_break_neon
    667
    668	add		$tbl0,$ctx,#(48+0*9*4)
    669	add		$tbl1,$ctx,#(48+1*9*4)
    670
    671	vtrn.32		$R0,$D0#lo		@ r^2:r^1
    672	vtrn.32		$R2,$D2#lo
    673	vtrn.32		$R3,$D3#lo
    674	vtrn.32		$R1,$D1#lo
    675	vtrn.32		$R4,$D4#lo
    676
    677	vshl.u32	$S2,$R2,#2		@ *5
    678	vshl.u32	$S3,$R3,#2
    679	vshl.u32	$S1,$R1,#2
    680	vshl.u32	$S4,$R4,#2
    681	vadd.i32	$S2,$S2,$R2
    682	vadd.i32	$S1,$S1,$R1
    683	vadd.i32	$S3,$S3,$R3
    684	vadd.i32	$S4,$S4,$R4
    685
    686	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
    687	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
    688	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
    689	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
    690	vst1.32		{${S4}[0]},[$tbl0,:32]
    691	vst1.32		{${S4}[1]},[$tbl1,:32]
    692
    693	b		.Lsquare_neon
    694
    695.align	4
    696.Lsquare_break_neon:
    697	add		$tbl0,$ctx,#(48+2*4*9)
    698	add		$tbl1,$ctx,#(48+3*4*9)
    699
    700	vmov		$R0,$D0#lo		@ r^4:r^3
    701	vshl.u32	$S1,$D1#lo,#2		@ *5
    702	vmov		$R1,$D1#lo
    703	vshl.u32	$S2,$D2#lo,#2
    704	vmov		$R2,$D2#lo
    705	vshl.u32	$S3,$D3#lo,#2
    706	vmov		$R3,$D3#lo
    707	vshl.u32	$S4,$D4#lo,#2
    708	vmov		$R4,$D4#lo
    709	vadd.i32	$S1,$S1,$D1#lo
    710	vadd.i32	$S2,$S2,$D2#lo
    711	vadd.i32	$S3,$S3,$D3#lo
    712	vadd.i32	$S4,$S4,$D4#lo
    713
    714	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
    715	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
    716	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
    717	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
    718	vst1.32		{${S4}[0]},[$tbl0]
    719	vst1.32		{${S4}[1]},[$tbl1]
    720
    721.Lno_init_neon:
    722	ret				@ bx	lr
    723.size	poly1305_init_neon,.-poly1305_init_neon
    724
    725.type	poly1305_blocks_neon,%function
    726.align	5
    727poly1305_blocks_neon:
    728.Lpoly1305_blocks_neon:
    729	ldr	ip,[$ctx,#36]		@ is_base2_26
    730
    731	cmp	$len,#64
    732	blo	.Lpoly1305_blocks
    733
    734	stmdb	sp!,{r4-r7}
    735	vstmdb	sp!,{d8-d15}		@ ABI specification says so
    736
    737	tst	ip,ip			@ is_base2_26?
    738	bne	.Lbase2_26_neon
    739
    740	stmdb	sp!,{r1-r3,lr}
    741	bl	.Lpoly1305_init_neon
    742
    743	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
    744	ldr	r5,[$ctx,#4]
    745	ldr	r6,[$ctx,#8]
    746	ldr	r7,[$ctx,#12]
    747	ldr	ip,[$ctx,#16]
    748
    749	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
    750	mov	r3,r4,lsr#26
    751	 veor	$D0#lo,$D0#lo,$D0#lo
    752	mov	r4,r5,lsr#20
    753	orr	r3,r3,r5,lsl#6
    754	 veor	$D1#lo,$D1#lo,$D1#lo
    755	mov	r5,r6,lsr#14
    756	orr	r4,r4,r6,lsl#12
    757	 veor	$D2#lo,$D2#lo,$D2#lo
    758	mov	r6,r7,lsr#8
    759	orr	r5,r5,r7,lsl#18
    760	 veor	$D3#lo,$D3#lo,$D3#lo
    761	and	r3,r3,#0x03ffffff
    762	orr	r6,r6,ip,lsl#24
    763	 veor	$D4#lo,$D4#lo,$D4#lo
    764	and	r4,r4,#0x03ffffff
    765	mov	r1,#1
    766	and	r5,r5,#0x03ffffff
    767	str	r1,[$ctx,#36]		@ set is_base2_26
    768
    769	vmov.32	$D0#lo[0],r2
    770	vmov.32	$D1#lo[0],r3
    771	vmov.32	$D2#lo[0],r4
    772	vmov.32	$D3#lo[0],r5
    773	vmov.32	$D4#lo[0],r6
    774	adr	$zeros,.Lzeros
    775
    776	ldmia	sp!,{r1-r3,lr}
    777	b	.Lhash_loaded
    778
    779.align	4
    780.Lbase2_26_neon:
    781	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    782	@ load hash value
    783
    784	veor		$D0#lo,$D0#lo,$D0#lo
    785	veor		$D1#lo,$D1#lo,$D1#lo
    786	veor		$D2#lo,$D2#lo,$D2#lo
    787	veor		$D3#lo,$D3#lo,$D3#lo
    788	veor		$D4#lo,$D4#lo,$D4#lo
    789	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
    790	adr		$zeros,.Lzeros
    791	vld1.32		{$D4#lo[0]},[$ctx]
    792	sub		$ctx,$ctx,#16		@ rewind
    793
    794.Lhash_loaded:
    795	add		$in2,$inp,#32
    796	mov		$padbit,$padbit,lsl#24
    797	tst		$len,#31
    798	beq		.Leven
    799
    800	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
    801	vmov.32		$H4#lo[0],$padbit
    802	sub		$len,$len,#16
    803	add		$in2,$inp,#32
    804
    805# ifdef	__ARMEB__
    806	vrev32.8	$H0,$H0
    807	vrev32.8	$H3,$H3
    808	vrev32.8	$H1,$H1
    809	vrev32.8	$H2,$H2
    810# endif
    811	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
    812	vshl.u32	$H3#lo,$H3#lo,#18
    813
    814	vsri.u32	$H3#lo,$H2#lo,#14
    815	vshl.u32	$H2#lo,$H2#lo,#12
    816	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi
    817
    818	vbic.i32	$H3#lo,#0xfc000000
    819	vsri.u32	$H2#lo,$H1#lo,#20
    820	vshl.u32	$H1#lo,$H1#lo,#6
    821
    822	vbic.i32	$H2#lo,#0xfc000000
    823	vsri.u32	$H1#lo,$H0#lo,#26
    824	vadd.i32	$H3#hi,$H3#lo,$D3#lo
    825
    826	vbic.i32	$H0#lo,#0xfc000000
    827	vbic.i32	$H1#lo,#0xfc000000
    828	vadd.i32	$H2#hi,$H2#lo,$D2#lo
    829
    830	vadd.i32	$H0#hi,$H0#lo,$D0#lo
    831	vadd.i32	$H1#hi,$H1#lo,$D1#lo
    832
    833	mov		$tbl1,$zeros
    834	add		$tbl0,$ctx,#48
    835
    836	cmp		$len,$len
    837	b		.Long_tail
    838
    839.align	4
    840.Leven:
    841	subs		$len,$len,#64
    842	it		lo
    843	movlo		$in2,$zeros
    844
    845	vmov.i32	$H4,#1<<24		@ padbit, yes, always
    846	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
    847	add		$inp,$inp,#64
    848	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
    849	add		$in2,$in2,#64
    850	itt		hi
    851	addhi		$tbl1,$ctx,#(48+1*9*4)
    852	addhi		$tbl0,$ctx,#(48+3*9*4)
    853
    854# ifdef	__ARMEB__
    855	vrev32.8	$H0,$H0
    856	vrev32.8	$H3,$H3
    857	vrev32.8	$H1,$H1
    858	vrev32.8	$H2,$H2
    859# endif
    860	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
    861	vshl.u32	$H3,$H3,#18
    862
    863	vsri.u32	$H3,$H2,#14
    864	vshl.u32	$H2,$H2,#12
    865
    866	vbic.i32	$H3,#0xfc000000
    867	vsri.u32	$H2,$H1,#20
    868	vshl.u32	$H1,$H1,#6
    869
    870	vbic.i32	$H2,#0xfc000000
    871	vsri.u32	$H1,$H0,#26
    872
    873	vbic.i32	$H0,#0xfc000000
    874	vbic.i32	$H1,#0xfc000000
    875
    876	bls		.Lskip_loop
    877
    878	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
    879	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
    880	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
    881	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
    882	b		.Loop_neon
    883
    884.align	5
    885.Loop_neon:
    886	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    887	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
    888	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
    889	@   \___________________/
    890	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
    891	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
    892	@   \___________________/ \____________________/
    893	@
    894	@ Note that we start with inp[2:3]*r^2. This is because it
     895	@ doesn't depend on the reduction in the previous iteration.
    896	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    897	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
    898	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
    899	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
    900	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
    901	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
    902
    903	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    904	@ inp[2:3]*r^2
    905
    906	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
    907	vmull.u32	$D2,$H2#hi,${R0}[1]
    908	vadd.i32	$H0#lo,$H0#lo,$D0#lo
    909	vmull.u32	$D0,$H0#hi,${R0}[1]
    910	vadd.i32	$H3#lo,$H3#lo,$D3#lo
    911	vmull.u32	$D3,$H3#hi,${R0}[1]
    912	vmlal.u32	$D2,$H1#hi,${R1}[1]
    913	vadd.i32	$H1#lo,$H1#lo,$D1#lo
    914	vmull.u32	$D1,$H1#hi,${R0}[1]
    915
    916	vadd.i32	$H4#lo,$H4#lo,$D4#lo
    917	vmull.u32	$D4,$H4#hi,${R0}[1]
    918	subs		$len,$len,#64
    919	vmlal.u32	$D0,$H4#hi,${S1}[1]
    920	it		lo
    921	movlo		$in2,$zeros
    922	vmlal.u32	$D3,$H2#hi,${R1}[1]
    923	vld1.32		${S4}[1],[$tbl1,:32]
    924	vmlal.u32	$D1,$H0#hi,${R1}[1]
    925	vmlal.u32	$D4,$H3#hi,${R1}[1]
    926
    927	vmlal.u32	$D0,$H3#hi,${S2}[1]
    928	vmlal.u32	$D3,$H1#hi,${R2}[1]
    929	vmlal.u32	$D4,$H2#hi,${R2}[1]
    930	vmlal.u32	$D1,$H4#hi,${S2}[1]
    931	vmlal.u32	$D2,$H0#hi,${R2}[1]
    932
    933	vmlal.u32	$D3,$H0#hi,${R3}[1]
    934	vmlal.u32	$D0,$H2#hi,${S3}[1]
    935	vmlal.u32	$D4,$H1#hi,${R3}[1]
    936	vmlal.u32	$D1,$H3#hi,${S3}[1]
    937	vmlal.u32	$D2,$H4#hi,${S3}[1]
    938
    939	vmlal.u32	$D3,$H4#hi,${S4}[1]
    940	vmlal.u32	$D0,$H1#hi,${S4}[1]
    941	vmlal.u32	$D4,$H0#hi,${R4}[1]
    942	vmlal.u32	$D1,$H2#hi,${S4}[1]
    943	vmlal.u32	$D2,$H3#hi,${S4}[1]
    944
    945	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
    946	add		$in2,$in2,#64
    947
    948	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    949	@ (hash+inp[0:1])*r^4 and accumulate
    950
    951	vmlal.u32	$D3,$H3#lo,${R0}[0]
    952	vmlal.u32	$D0,$H0#lo,${R0}[0]
    953	vmlal.u32	$D4,$H4#lo,${R0}[0]
    954	vmlal.u32	$D1,$H1#lo,${R0}[0]
    955	vmlal.u32	$D2,$H2#lo,${R0}[0]
    956	vld1.32		${S4}[0],[$tbl0,:32]
    957
    958	vmlal.u32	$D3,$H2#lo,${R1}[0]
    959	vmlal.u32	$D0,$H4#lo,${S1}[0]
    960	vmlal.u32	$D4,$H3#lo,${R1}[0]
    961	vmlal.u32	$D1,$H0#lo,${R1}[0]
    962	vmlal.u32	$D2,$H1#lo,${R1}[0]
    963
    964	vmlal.u32	$D3,$H1#lo,${R2}[0]
    965	vmlal.u32	$D0,$H3#lo,${S2}[0]
    966	vmlal.u32	$D4,$H2#lo,${R2}[0]
    967	vmlal.u32	$D1,$H4#lo,${S2}[0]
    968	vmlal.u32	$D2,$H0#lo,${R2}[0]
    969
    970	vmlal.u32	$D3,$H0#lo,${R3}[0]
    971	vmlal.u32	$D0,$H2#lo,${S3}[0]
    972	vmlal.u32	$D4,$H1#lo,${R3}[0]
    973	vmlal.u32	$D1,$H3#lo,${S3}[0]
    974	vmlal.u32	$D3,$H4#lo,${S4}[0]
    975
    976	vmlal.u32	$D2,$H4#lo,${S3}[0]
    977	vmlal.u32	$D0,$H1#lo,${S4}[0]
    978	vmlal.u32	$D4,$H0#lo,${R4}[0]
    979	vmov.i32	$H4,#1<<24		@ padbit, yes, always
    980	vmlal.u32	$D1,$H2#lo,${S4}[0]
    981	vmlal.u32	$D2,$H3#lo,${S4}[0]
    982
    983	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
    984	add		$inp,$inp,#64
    985# ifdef	__ARMEB__
    986	vrev32.8	$H0,$H0
    987	vrev32.8	$H1,$H1
    988	vrev32.8	$H2,$H2
    989	vrev32.8	$H3,$H3
    990# endif
    991
    992	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    993	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
    994	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
    995
    996	vshr.u64	$T0,$D3,#26
    997	vmovn.i64	$D3#lo,$D3
    998	 vshr.u64	$T1,$D0,#26
    999	 vmovn.i64	$D0#lo,$D0
   1000	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
   1001	vbic.i32	$D3#lo,#0xfc000000
   1002	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
   1003	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
   1004	  vshl.u32	$H3,$H3,#18
   1005	 vbic.i32	$D0#lo,#0xfc000000
   1006
   1007	vshrn.u64	$T0#lo,$D4,#26
   1008	vmovn.i64	$D4#lo,$D4
   1009	 vshr.u64	$T1,$D1,#26
   1010	 vmovn.i64	$D1#lo,$D1
   1011	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
   1012	  vsri.u32	$H3,$H2,#14
   1013	vbic.i32	$D4#lo,#0xfc000000
   1014	  vshl.u32	$H2,$H2,#12
   1015	 vbic.i32	$D1#lo,#0xfc000000
   1016
   1017	vadd.i32	$D0#lo,$D0#lo,$T0#lo
   1018	vshl.u32	$T0#lo,$T0#lo,#2
   1019	  vbic.i32	$H3,#0xfc000000
   1020	 vshrn.u64	$T1#lo,$D2,#26
   1021	 vmovn.i64	$D2#lo,$D2
   1022	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
   1023	  vsri.u32	$H2,$H1,#20
   1024	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
   1025	  vshl.u32	$H1,$H1,#6
   1026	 vbic.i32	$D2#lo,#0xfc000000
   1027	  vbic.i32	$H2,#0xfc000000
   1028
   1029	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
   1030	vmovn.i64	$D0#lo,$D0
   1031	  vsri.u32	$H1,$H0,#26
   1032	  vbic.i32	$H0,#0xfc000000
   1033	 vshr.u32	$T1#lo,$D3#lo,#26
   1034	 vbic.i32	$D3#lo,#0xfc000000
   1035	vbic.i32	$D0#lo,#0xfc000000
   1036	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
   1037	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
   1038	  vbic.i32	$H1,#0xfc000000
   1039
   1040	bhi		.Loop_neon
   1041
   1042.Lskip_loop:
   1043	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
   1044	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
   1045
   1046	add		$tbl1,$ctx,#(48+0*9*4)
   1047	add		$tbl0,$ctx,#(48+1*9*4)
   1048	adds		$len,$len,#32
   1049	it		ne
   1050	movne		$len,#0
   1051	bne		.Long_tail
   1052
   1053	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
   1054	vadd.i32	$H0#hi,$H0#lo,$D0#lo
   1055	vadd.i32	$H3#hi,$H3#lo,$D3#lo
   1056	vadd.i32	$H1#hi,$H1#lo,$D1#lo
   1057	vadd.i32	$H4#hi,$H4#lo,$D4#lo
   1058
   1059.Long_tail:
   1060	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
   1061	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2
   1062
   1063	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
   1064	vmull.u32	$D2,$H2#hi,$R0
   1065	vadd.i32	$H0#lo,$H0#lo,$D0#lo
   1066	vmull.u32	$D0,$H0#hi,$R0
   1067	vadd.i32	$H3#lo,$H3#lo,$D3#lo
   1068	vmull.u32	$D3,$H3#hi,$R0
   1069	vadd.i32	$H1#lo,$H1#lo,$D1#lo
   1070	vmull.u32	$D1,$H1#hi,$R0
   1071	vadd.i32	$H4#lo,$H4#lo,$D4#lo
   1072	vmull.u32	$D4,$H4#hi,$R0
   1073
   1074	vmlal.u32	$D0,$H4#hi,$S1
   1075	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
   1076	vmlal.u32	$D3,$H2#hi,$R1
   1077	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
   1078	vmlal.u32	$D1,$H0#hi,$R1
   1079	vmlal.u32	$D4,$H3#hi,$R1
   1080	vmlal.u32	$D2,$H1#hi,$R1
   1081
   1082	vmlal.u32	$D3,$H1#hi,$R2
   1083	vld1.32		${S4}[1],[$tbl1,:32]
   1084	vmlal.u32	$D0,$H3#hi,$S2
   1085	vld1.32		${S4}[0],[$tbl0,:32]
   1086	vmlal.u32	$D4,$H2#hi,$R2
   1087	vmlal.u32	$D1,$H4#hi,$S2
   1088	vmlal.u32	$D2,$H0#hi,$R2
   1089
   1090	vmlal.u32	$D3,$H0#hi,$R3
   1091	 it		ne
   1092	 addne		$tbl1,$ctx,#(48+2*9*4)
   1093	vmlal.u32	$D0,$H2#hi,$S3
   1094	 it		ne
   1095	 addne		$tbl0,$ctx,#(48+3*9*4)
   1096	vmlal.u32	$D4,$H1#hi,$R3
   1097	vmlal.u32	$D1,$H3#hi,$S3
   1098	vmlal.u32	$D2,$H4#hi,$S3
   1099
   1100	vmlal.u32	$D3,$H4#hi,$S4
   1101	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
   1102	vmlal.u32	$D0,$H1#hi,$S4
   1103	 vshr.u64	$MASK,$MASK,#38
   1104	vmlal.u32	$D4,$H0#hi,$R4
   1105	vmlal.u32	$D1,$H2#hi,$S4
   1106	vmlal.u32	$D2,$H3#hi,$S4
   1107
   1108	beq		.Lshort_tail
   1109
   1110	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
   1111	@ (hash+inp[0:1])*r^4:r^3 and accumulate
   1112
   1113	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
   1114	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
   1115
   1116	vmlal.u32	$D2,$H2#lo,$R0
   1117	vmlal.u32	$D0,$H0#lo,$R0
   1118	vmlal.u32	$D3,$H3#lo,$R0
   1119	vmlal.u32	$D1,$H1#lo,$R0
   1120	vmlal.u32	$D4,$H4#lo,$R0
   1121
   1122	vmlal.u32	$D0,$H4#lo,$S1
   1123	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
   1124	vmlal.u32	$D3,$H2#lo,$R1
   1125	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
   1126	vmlal.u32	$D1,$H0#lo,$R1
   1127	vmlal.u32	$D4,$H3#lo,$R1
   1128	vmlal.u32	$D2,$H1#lo,$R1
   1129
   1130	vmlal.u32	$D3,$H1#lo,$R2
   1131	vld1.32		${S4}[1],[$tbl1,:32]
   1132	vmlal.u32	$D0,$H3#lo,$S2
   1133	vld1.32		${S4}[0],[$tbl0,:32]
   1134	vmlal.u32	$D4,$H2#lo,$R2
   1135	vmlal.u32	$D1,$H4#lo,$S2
   1136	vmlal.u32	$D2,$H0#lo,$R2
   1137
   1138	vmlal.u32	$D3,$H0#lo,$R3
   1139	vmlal.u32	$D0,$H2#lo,$S3
   1140	vmlal.u32	$D4,$H1#lo,$R3
   1141	vmlal.u32	$D1,$H3#lo,$S3
   1142	vmlal.u32	$D2,$H4#lo,$S3
   1143
   1144	vmlal.u32	$D3,$H4#lo,$S4
   1145	 vorn		$MASK,$MASK,$MASK	@ all-ones
   1146	vmlal.u32	$D0,$H1#lo,$S4
   1147	 vshr.u64	$MASK,$MASK,#38
   1148	vmlal.u32	$D4,$H0#lo,$R4
   1149	vmlal.u32	$D1,$H2#lo,$S4
   1150	vmlal.u32	$D2,$H3#lo,$S4
   1151
   1152.Lshort_tail:
   1153	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
   1154	@ horizontal addition
   1155
   1156	vadd.i64	$D3#lo,$D3#lo,$D3#hi
   1157	vadd.i64	$D0#lo,$D0#lo,$D0#hi
   1158	vadd.i64	$D4#lo,$D4#lo,$D4#hi
   1159	vadd.i64	$D1#lo,$D1#lo,$D1#hi
   1160	vadd.i64	$D2#lo,$D2#lo,$D2#hi
   1161
   1162	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
   1163	@ lazy reduction, but without narrowing
   1164
   1165	vshr.u64	$T0,$D3,#26
   1166	vand.i64	$D3,$D3,$MASK
   1167	 vshr.u64	$T1,$D0,#26
   1168	 vand.i64	$D0,$D0,$MASK
   1169	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
   1170	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
   1171
   1172	vshr.u64	$T0,$D4,#26
   1173	vand.i64	$D4,$D4,$MASK
   1174	 vshr.u64	$T1,$D1,#26
   1175	 vand.i64	$D1,$D1,$MASK
   1176	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
   1177
   1178	vadd.i64	$D0,$D0,$T0
   1179	vshl.u64	$T0,$T0,#2
   1180	 vshr.u64	$T1,$D2,#26
   1181	 vand.i64	$D2,$D2,$MASK
   1182	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
   1183	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3
   1184
   1185	vshr.u64	$T0,$D0,#26
   1186	vand.i64	$D0,$D0,$MASK
   1187	 vshr.u64	$T1,$D3,#26
   1188	 vand.i64	$D3,$D3,$MASK
   1189	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
   1190	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4
   1191
   1192	cmp		$len,#0
   1193	bne		.Leven
   1194
   1195	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
   1196	@ store hash value
   1197
   1198	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
   1199	vst1.32		{$D4#lo[0]},[$ctx]
   1200
   1201	vldmia	sp!,{d8-d15}			@ epilogue
   1202	ldmia	sp!,{r4-r7}
   1203	ret					@ bx	lr
   1204.size	poly1305_blocks_neon,.-poly1305_blocks_neon
   1205
   1206.align	5
   1207.Lzeros:
   1208.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
   1209#ifndef	__KERNEL__
   1210.LOPENSSL_armcap:
   1211# ifdef	_WIN32
   1212.word	OPENSSL_armcap_P
   1213# else
   1214.word	OPENSSL_armcap_P-.Lpoly1305_init
   1215# endif
   1216.comm	OPENSSL_armcap_P,4,4
   1217.hidden	OPENSSL_armcap_P
   1218#endif
   1219#endif
   1220___
   1221}	}
   1222$code.=<<___;
   1223.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
   1224.align	2
   1225___
   1226
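# Post-process the generated code: evaluate `...` expressions, rewrite the
# qN#lo/qN#hi notation used above into the corresponding dN registers, turn
# ret into bx lr, and encode any literal bx lr as a raw .word so the output
# can still be assembled with -march=armv4.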
   1227foreach (split("\n",$code)) {
   1228	s/\`([^\`]*)\`/eval $1/geo;
   1229
   1230	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
   1231	s/\bret\b/bx	lr/go						or
   1232	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
   1233
   1234	print $_,"\n";
   1235}
   1236close STDOUT; # enforce flush