cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

poly1305-mips.pl (24537B)


       1#!/usr/bin/env perl
       2# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
       3#
       4# ====================================================================
       5# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
       6# project.
       7# ====================================================================
       8
       9# Poly1305 hash for MIPS.
      10#
      11# May 2016
      12#
      13# Numbers are cycles per processed byte with poly1305_blocks alone.
      14#
      15#		IALU/gcc
      16# R1x000	~5.5/+130%	(big-endian)
      17# Octeon II	2.50/+70%	(little-endian)
      18#
      19# March 2019
      20#
      21# Add 32-bit code path.
      22#
      23# October 2019
      24#
      25# Modulo-scheduling reduction allows to omit dependency chain at the
      26# end of inner loop and improve performance. Also optimize MIPS32R2
      27# code path for MIPS 1004K core. Per René von Dorst's suggestions.
      28#
      29#		IALU/gcc
      30# R1x000	~9.8/?		(big-endian)
      31# Octeon II	3.65/+140%	(little-endian)
      32# MT7621/1004K	4.75/?		(little-endian)
      33#
      34######################################################################
      35# There is a number of MIPS ABI in use, O32 and N32/64 are most
      36# widely used. Then there is a new contender: NUBI. It appears that if
      37# one picks the latter, it's possible to arrange code in ABI neutral
      38# manner. Therefore let's stick to NUBI register layout:
      39#
# NUBI-layout register aliases used by both code paths below; the names
# are Perl scalars that interpolate into the assembly heredocs.
      40($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
      41($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
      42($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
      43($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
      44#
      45# The return value is placed in $a0. Following coding rules facilitate
      46# interoperability:
      47#
      48# - never ever touch $tp, "thread pointer", former $gp [o32 can be
      49#   excluded from the rule, because it's specified volatile];
      50# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
      51#   old code];
      52# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
      53#
      54# For reference here is register layout for N32/64 MIPS ABIs:
      55#
      56# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
      57# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
      58# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
      59# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
      60# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
      61#
      62# <appro@openssl.org>
      63#
      64######################################################################
      65
# ABI flavour taken from the first command-line argument; "64"/"n32"
# select the 64-bit path below, anything else the 32-bit path.
      66$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
      67
# Return-value register: NUBI returns in $a0, other ABIs in $t0 (old $v0).
      68$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
     69
      70if ($flavour =~ /64|n32/i) {{{
      71######################################################################
      72# 64-bit code path
      73#
      74
# Argument registers for init/blocks (ctx pointer, input pointer, length,
# pad bit) plus scratch registers used throughout the 64-bit path.
      75my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
      76my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
      77
# Emit poly1305_init: zero the 3-limb hash state at ctx+0/8/16, load the
# 16-byte key (unaligned-safe: shift/merge on R6, ldl/ldr otherwise, with
# a byte swap on big-endian), clamp it with the 0x0ffffffc0fffffff /
# 0x0ffffffc0ffffffc masks, and store r0, r1 and s1 = r1 + (r1 >> 2).
      78$code.=<<___;
      79#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
      80     defined(_MIPS_ARCH_MIPS64R6)) \\
      81     && !defined(_MIPS_ARCH_MIPS64R2)
      82# define _MIPS_ARCH_MIPS64R2
      83#endif
      84
      85#if defined(_MIPS_ARCH_MIPS64R6)
      86# define dmultu(rs,rt)
      87# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
      88# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
      89#else
      90# define dmultu(rs,rt)		dmultu	rs,rt
      91# define mflo(rd,rs,rt)	mflo	rd
      92# define mfhi(rd,rs,rt)	mfhi	rd
      93#endif
      94
      95#ifdef	__KERNEL__
      96# define poly1305_init   poly1305_init_mips
      97# define poly1305_blocks poly1305_blocks_mips
      98# define poly1305_emit   poly1305_emit_mips
      99#endif
     100
     101#if defined(__MIPSEB__) && !defined(MIPSEB)
     102# define MIPSEB
     103#endif
     104
     105#ifdef MIPSEB
     106# define MSB 0
     107# define LSB 7
     108#else
     109# define MSB 7
     110# define LSB 0
     111#endif
     112
     113.text
     114.set	noat
     115.set	noreorder
     116
     117.align	5
     118.globl	poly1305_init
     119.ent	poly1305_init
     120poly1305_init:
     121	.frame	$sp,0,$ra
     122	.set	reorder
     123
     124	sd	$zero,0($ctx)
     125	sd	$zero,8($ctx)
     126	sd	$zero,16($ctx)
     127
     128	beqz	$inp,.Lno_key
     129
     130#if defined(_MIPS_ARCH_MIPS64R6)
     131	andi	$tmp0,$inp,7		# $inp % 8
     132	dsubu	$inp,$inp,$tmp0		# align $inp
     133	sll	$tmp0,$tmp0,3		# byte to bit offset
     134	ld	$in0,0($inp)
     135	ld	$in1,8($inp)
     136	beqz	$tmp0,.Laligned_key
     137	ld	$tmp2,16($inp)
     138
     139	subu	$tmp1,$zero,$tmp0
     140# ifdef	MIPSEB
     141	dsllv	$in0,$in0,$tmp0
     142	dsrlv	$tmp3,$in1,$tmp1
     143	dsllv	$in1,$in1,$tmp0
     144	dsrlv	$tmp2,$tmp2,$tmp1
     145# else
     146	dsrlv	$in0,$in0,$tmp0
     147	dsllv	$tmp3,$in1,$tmp1
     148	dsrlv	$in1,$in1,$tmp0
     149	dsllv	$tmp2,$tmp2,$tmp1
     150# endif
     151	or	$in0,$in0,$tmp3
     152	or	$in1,$in1,$tmp2
     153.Laligned_key:
     154#else
     155	ldl	$in0,0+MSB($inp)
     156	ldl	$in1,8+MSB($inp)
     157	ldr	$in0,0+LSB($inp)
     158	ldr	$in1,8+LSB($inp)
     159#endif
     160#ifdef	MIPSEB
     161# if defined(_MIPS_ARCH_MIPS64R2)
     162	dsbh	$in0,$in0		# byte swap
     163	 dsbh	$in1,$in1
     164	dshd	$in0,$in0
     165	 dshd	$in1,$in1
     166# else
     167	ori	$tmp0,$zero,0xFF
     168	dsll	$tmp2,$tmp0,32
     169	or	$tmp0,$tmp2		# 0x000000FF000000FF
     170
     171	and	$tmp1,$in0,$tmp0	# byte swap
     172	 and	$tmp3,$in1,$tmp0
     173	dsrl	$tmp2,$in0,24
     174	 dsrl	$tmp4,$in1,24
     175	dsll	$tmp1,24
     176	 dsll	$tmp3,24
     177	and	$tmp2,$tmp0
     178	 and	$tmp4,$tmp0
     179	dsll	$tmp0,8			# 0x0000FF000000FF00
     180	or	$tmp1,$tmp2
     181	 or	$tmp3,$tmp4
     182	and	$tmp2,$in0,$tmp0
     183	 and	$tmp4,$in1,$tmp0
     184	dsrl	$in0,8
     185	 dsrl	$in1,8
     186	dsll	$tmp2,8
     187	 dsll	$tmp4,8
     188	and	$in0,$tmp0
     189	 and	$in1,$tmp0
     190	or	$tmp1,$tmp2
     191	 or	$tmp3,$tmp4
     192	or	$in0,$tmp1
     193	 or	$in1,$tmp3
     194	dsrl	$tmp1,$in0,32
     195	 dsrl	$tmp3,$in1,32
     196	dsll	$in0,32
     197	 dsll	$in1,32
     198	or	$in0,$tmp1
     199	 or	$in1,$tmp3
     200# endif
     201#endif
     202	li	$tmp0,1
     203	dsll	$tmp0,32		# 0x0000000100000000
     204	daddiu	$tmp0,-63		# 0x00000000ffffffc1
     205	dsll	$tmp0,28		# 0x0ffffffc10000000
     206	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff
     207
     208	and	$in0,$tmp0
     209	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
     210	and	$in1,$tmp0
     211
     212	sd	$in0,24($ctx)
     213	dsrl	$tmp0,$in1,2
     214	sd	$in1,32($ctx)
     215	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
     216	sd	$tmp0,40($ctx)
     217
     218.Lno_key:
     219	li	$v0,0			# return 0
     220	jr	$ra
     221.end	poly1305_init
     222___
     223{
# Callee-saved register mask for .mask: NUBI saves $s0-$s5, other ABIs
# only $s4/$s5 (plus $s6/$s7 on R6, OR-ed in at the .mask below).
     224my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
     225
# Hash limbs, key limbs (r0, r1, s1 = 5*r1>>2 precomputed by init) and
# accumulator registers for the 64-bit blocks loop.
     226my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
     227   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
     228my ($shr,$shl) = ($s6,$s7);		# used on R6
     229
# Emit poly1305_blocks: a thin wrapper that returns early when fewer than
# 16 bytes remain, then poly1305_blocks_internal with the main loop.
# Each iteration folds h2's upper bits back in (modulo-scheduled
# reduction), accumulates one 16-byte block plus padbit into (d0,d1,d2)
# and multiplies by (r0,r1,rs1) via the dmultu/mflo/mfhi macros.
     230$code.=<<___;
     231.align	5
     232.globl	poly1305_blocks
     233.ent	poly1305_blocks
     234poly1305_blocks:
     235	.set	noreorder
     236	dsrl	$len,4			# number of complete blocks
     237	bnez	$len,poly1305_blocks_internal
     238	nop
     239	jr	$ra
     240	nop
     241.end	poly1305_blocks
     242
     243.align	5
     244.ent	poly1305_blocks_internal
     245poly1305_blocks_internal:
     246	.set	noreorder
     247#if defined(_MIPS_ARCH_MIPS64R6)
     248	.frame	$sp,8*8,$ra
     249	.mask	$SAVED_REGS_MASK|0x000c0000,-8
     250	dsubu	$sp,8*8
     251	sd	$s7,56($sp)
     252	sd	$s6,48($sp)
     253#else
     254	.frame	$sp,6*8,$ra
     255	.mask	$SAVED_REGS_MASK,-8
     256	dsubu	$sp,6*8
     257#endif
     258	sd	$s5,40($sp)
     259	sd	$s4,32($sp)
     260___
     261$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
     262	sd	$s3,24($sp)
     263	sd	$s2,16($sp)
     264	sd	$s1,8($sp)
     265	sd	$s0,0($sp)
     266___
     267$code.=<<___;
     268	.set	reorder
     269
     270#if defined(_MIPS_ARCH_MIPS64R6)
     271	andi	$shr,$inp,7
     272	dsubu	$inp,$inp,$shr		# align $inp
     273	sll	$shr,$shr,3		# byte to bit offset
     274	subu	$shl,$zero,$shr
     275#endif
     276
     277	ld	$h0,0($ctx)		# load hash value
     278	ld	$h1,8($ctx)
     279	ld	$h2,16($ctx)
     280
     281	ld	$r0,24($ctx)		# load key
     282	ld	$r1,32($ctx)
     283	ld	$rs1,40($ctx)
     284
     285	dsll	$len,4
     286	daddu	$len,$inp		# end of buffer
     287	b	.Loop
     288
     289.align	4
     290.Loop:
     291#if defined(_MIPS_ARCH_MIPS64R6)
     292	ld	$in0,0($inp)		# load input
     293	ld	$in1,8($inp)
     294	beqz	$shr,.Laligned_inp
     295
     296	ld	$tmp2,16($inp)
     297# ifdef	MIPSEB
     298	dsllv	$in0,$in0,$shr
     299	dsrlv	$tmp3,$in1,$shl
     300	dsllv	$in1,$in1,$shr
     301	dsrlv	$tmp2,$tmp2,$shl
     302# else
     303	dsrlv	$in0,$in0,$shr
     304	dsllv	$tmp3,$in1,$shl
     305	dsrlv	$in1,$in1,$shr
     306	dsllv	$tmp2,$tmp2,$shl
     307# endif
     308	or	$in0,$in0,$tmp3
     309	or	$in1,$in1,$tmp2
     310.Laligned_inp:
     311#else
     312	ldl	$in0,0+MSB($inp)	# load input
     313	ldl	$in1,8+MSB($inp)
     314	ldr	$in0,0+LSB($inp)
     315	ldr	$in1,8+LSB($inp)
     316#endif
     317	daddiu	$inp,16
     318#ifdef	MIPSEB
     319# if defined(_MIPS_ARCH_MIPS64R2)
     320	dsbh	$in0,$in0		# byte swap
     321	 dsbh	$in1,$in1
     322	dshd	$in0,$in0
     323	 dshd	$in1,$in1
     324# else
     325	ori	$tmp0,$zero,0xFF
     326	dsll	$tmp2,$tmp0,32
     327	or	$tmp0,$tmp2		# 0x000000FF000000FF
     328
     329	and	$tmp1,$in0,$tmp0	# byte swap
     330	 and	$tmp3,$in1,$tmp0
     331	dsrl	$tmp2,$in0,24
     332	 dsrl	$tmp4,$in1,24
     333	dsll	$tmp1,24
     334	 dsll	$tmp3,24
     335	and	$tmp2,$tmp0
     336	 and	$tmp4,$tmp0
     337	dsll	$tmp0,8			# 0x0000FF000000FF00
     338	or	$tmp1,$tmp2
     339	 or	$tmp3,$tmp4
     340	and	$tmp2,$in0,$tmp0
     341	 and	$tmp4,$in1,$tmp0
     342	dsrl	$in0,8
     343	 dsrl	$in1,8
     344	dsll	$tmp2,8
     345	 dsll	$tmp4,8
     346	and	$in0,$tmp0
     347	 and	$in1,$tmp0
     348	or	$tmp1,$tmp2
     349	 or	$tmp3,$tmp4
     350	or	$in0,$tmp1
     351	 or	$in1,$tmp3
     352	dsrl	$tmp1,$in0,32
     353	 dsrl	$tmp3,$in1,32
     354	dsll	$in0,32
     355	 dsll	$in1,32
     356	or	$in0,$tmp1
     357	 or	$in1,$tmp3
     358# endif
     359#endif
     360	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
     361	andi	$h2,$h2,3
     362	dsll	$tmp0,$tmp1,2
     363
     364	daddu	$d0,$h0,$in0		# accumulate input
     365	 daddu	$tmp1,$tmp0
     366	sltu	$tmp0,$d0,$h0
     367	daddu	$d0,$d0,$tmp1		# ... and residue
     368	sltu	$tmp1,$d0,$tmp1
     369	daddu	$d1,$h1,$in1
     370	daddu	$tmp0,$tmp1
     371	sltu	$tmp1,$d1,$h1
     372	daddu	$d1,$tmp0
     373
     374	dmultu	($r0,$d0)		# h0*r0
     375	 daddu	$d2,$h2,$padbit
     376	 sltu	$tmp0,$d1,$tmp0
     377	mflo	($h0,$r0,$d0)
     378	mfhi	($h1,$r0,$d0)
     379
     380	dmultu	($rs1,$d1)		# h1*5*r1
     381	 daddu	$d2,$tmp1
     382	 daddu	$d2,$tmp0
     383	mflo	($tmp0,$rs1,$d1)
     384	mfhi	($tmp1,$rs1,$d1)
     385
     386	dmultu	($r1,$d0)		# h0*r1
     387	mflo	($tmp2,$r1,$d0)
     388	mfhi	($h2,$r1,$d0)
     389	 daddu	$h0,$tmp0
     390	 daddu	$h1,$tmp1
     391	 sltu	$tmp0,$h0,$tmp0
     392
     393	dmultu	($r0,$d1)		# h1*r0
     394	 daddu	$h1,$tmp0
     395	 daddu	$h1,$tmp2
     396	mflo	($tmp0,$r0,$d1)
     397	mfhi	($tmp1,$r0,$d1)
     398
     399	dmultu	($rs1,$d2)		# h2*5*r1
     400	 sltu	$tmp2,$h1,$tmp2
     401	 daddu	$h2,$tmp2
     402	mflo	($tmp2,$rs1,$d2)
     403
     404	dmultu	($r0,$d2)		# h2*r0
     405	 daddu	$h1,$tmp0
     406	 daddu	$h2,$tmp1
     407	mflo	($tmp3,$r0,$d2)
     408	 sltu	$tmp0,$h1,$tmp0
     409	 daddu	$h2,$tmp0
     410
     411	daddu	$h1,$tmp2
     412	sltu	$tmp2,$h1,$tmp2
     413	daddu	$h2,$tmp2
     414	daddu	$h2,$tmp3
     415
     416	bne	$inp,$len,.Loop
     417
     418	sd	$h0,0($ctx)		# store hash value
     419	sd	$h1,8($ctx)
     420	sd	$h2,16($ctx)
     421
     422	.set	noreorder
     423#if defined(_MIPS_ARCH_MIPS64R6)
     424	ld	$s7,56($sp)
     425	ld	$s6,48($sp)
     426#endif
     427	ld	$s5,40($sp)		# epilogue
     428	ld	$s4,32($sp)
     429___
     430$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
     431	ld	$s3,24($sp)
     432	ld	$s2,16($sp)
     433	ld	$s1,8($sp)
     434	ld	$s0,0($sp)
     435___
     436$code.=<<___;
     437	jr	$ra
     438#if defined(_MIPS_ARCH_MIPS64R6)
     439	daddu	$sp,8*8
     440#else
     441	daddu	$sp,6*8
     442#endif
     443.end	poly1305_blocks_internal
     444___
     445}
     446{
# Emit poly1305_emit (64-bit): perform the final reduction mod 2^130-5,
# select h or h-p branchlessly via the xor/and/xor mask sequence (no
# secret-dependent branch), add the 128-bit nonce and store the 16-byte
# tag one byte at a time (alignment-agnostic).
     447my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
     448
     449$code.=<<___;
     450.align	5
     451.globl	poly1305_emit
     452.ent	poly1305_emit
     453poly1305_emit:
     454	.frame	$sp,0,$ra
     455	.set	reorder
     456
     457	ld	$tmp2,16($ctx)
     458	ld	$tmp0,0($ctx)
     459	ld	$tmp1,8($ctx)
     460
     461	li	$in0,-4			# final reduction
     462	dsrl	$in1,$tmp2,2
     463	and	$in0,$tmp2
     464	andi	$tmp2,$tmp2,3
     465	daddu	$in0,$in1
     466
     467	daddu	$tmp0,$tmp0,$in0
     468	sltu	$in1,$tmp0,$in0
     469	 daddiu	$in0,$tmp0,5		# compare to modulus
     470	daddu	$tmp1,$tmp1,$in1
     471	 sltiu	$tmp3,$in0,5
     472	sltu	$tmp4,$tmp1,$in1
     473	 daddu	$in1,$tmp1,$tmp3
     474	daddu	$tmp2,$tmp2,$tmp4
     475	 sltu	$tmp3,$in1,$tmp3
     476	 daddu	$tmp2,$tmp2,$tmp3
     477
     478	dsrl	$tmp2,2			# see if it carried/borrowed
     479	dsubu	$tmp2,$zero,$tmp2
     480
     481	xor	$in0,$tmp0
     482	xor	$in1,$tmp1
     483	and	$in0,$tmp2
     484	and	$in1,$tmp2
     485	xor	$in0,$tmp0
     486	xor	$in1,$tmp1
     487
     488	lwu	$tmp0,0($nonce)		# load nonce
     489	lwu	$tmp1,4($nonce)
     490	lwu	$tmp2,8($nonce)
     491	lwu	$tmp3,12($nonce)
     492	dsll	$tmp1,32
     493	dsll	$tmp3,32
     494	or	$tmp0,$tmp1
     495	or	$tmp2,$tmp3
     496
     497	daddu	$in0,$tmp0		# accumulate nonce
     498	daddu	$in1,$tmp2
     499	sltu	$tmp0,$in0,$tmp0
     500	daddu	$in1,$tmp0
     501
     502	dsrl	$tmp0,$in0,8		# write mac value
     503	dsrl	$tmp1,$in0,16
     504	dsrl	$tmp2,$in0,24
     505	sb	$in0,0($mac)
     506	dsrl	$tmp3,$in0,32
     507	sb	$tmp0,1($mac)
     508	dsrl	$tmp0,$in0,40
     509	sb	$tmp1,2($mac)
     510	dsrl	$tmp1,$in0,48
     511	sb	$tmp2,3($mac)
     512	dsrl	$tmp2,$in0,56
     513	sb	$tmp3,4($mac)
     514	dsrl	$tmp3,$in1,8
     515	sb	$tmp0,5($mac)
     516	dsrl	$tmp0,$in1,16
     517	sb	$tmp1,6($mac)
     518	dsrl	$tmp1,$in1,24
     519	sb	$tmp2,7($mac)
     520
     521	sb	$in1,8($mac)
     522	dsrl	$tmp2,$in1,32
     523	sb	$tmp3,9($mac)
     524	dsrl	$tmp3,$in1,40
     525	sb	$tmp0,10($mac)
     526	dsrl	$tmp0,$in1,48
     527	sb	$tmp1,11($mac)
     528	dsrl	$tmp1,$in1,56
     529	sb	$tmp2,12($mac)
     530	sb	$tmp3,13($mac)
     531	sb	$tmp0,14($mac)
     532	sb	$tmp1,15($mac)
     533
     534	jr	$ra
     535.end	poly1305_emit
     536.rdata
     537.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
     538.align	2
     539___
     540}
     541}}} else {{{
     542######################################################################
     543# 32-bit code path
     544#
     545
# Argument and scratch registers for the 32-bit path; the key and hash
# are handled as four/five 32-bit limbs instead of two/three 64-bit ones.
     546my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
     547my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
     548   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
     549
# Emit poly1305_init (32-bit): zero the 5-word hash state, load the key
# (unaligned-safe, byte-swapped on big-endian), clamp with 0x0fffffff /
# 0x0ffffffc, then store r0..r3 and precomputed sN = rN + (rN >> 2).
     550$code.=<<___;
     551#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     552     defined(_MIPS_ARCH_MIPS32R6)) \\
     553     && !defined(_MIPS_ARCH_MIPS32R2)
     554# define _MIPS_ARCH_MIPS32R2
     555#endif
     556
     557#if defined(_MIPS_ARCH_MIPS32R6)
     558# define multu(rs,rt)
     559# define mflo(rd,rs,rt)	mulu	rd,rs,rt
     560# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
     561#else
     562# define multu(rs,rt)	multu	rs,rt
     563# define mflo(rd,rs,rt)	mflo	rd
     564# define mfhi(rd,rs,rt)	mfhi	rd
     565#endif
     566
     567#ifdef	__KERNEL__
     568# define poly1305_init   poly1305_init_mips
     569# define poly1305_blocks poly1305_blocks_mips
     570# define poly1305_emit   poly1305_emit_mips
     571#endif
     572
     573#if defined(__MIPSEB__) && !defined(MIPSEB)
     574# define MIPSEB
     575#endif
     576
     577#ifdef MIPSEB
     578# define MSB 0
     579# define LSB 3
     580#else
     581# define MSB 3
     582# define LSB 0
     583#endif
     584
     585.text
     586.set	noat
     587.set	noreorder
     588
     589.align	5
     590.globl	poly1305_init
     591.ent	poly1305_init
     592poly1305_init:
     593	.frame	$sp,0,$ra
     594	.set	reorder
     595
     596	sw	$zero,0($ctx)
     597	sw	$zero,4($ctx)
     598	sw	$zero,8($ctx)
     599	sw	$zero,12($ctx)
     600	sw	$zero,16($ctx)
     601
     602	beqz	$inp,.Lno_key
     603
     604#if defined(_MIPS_ARCH_MIPS32R6)
     605	andi	$tmp0,$inp,3		# $inp % 4
     606	subu	$inp,$inp,$tmp0		# align $inp
     607	sll	$tmp0,$tmp0,3		# byte to bit offset
     608	lw	$in0,0($inp)
     609	lw	$in1,4($inp)
     610	lw	$in2,8($inp)
     611	lw	$in3,12($inp)
     612	beqz	$tmp0,.Laligned_key
     613
     614	lw	$tmp2,16($inp)
     615	subu	$tmp1,$zero,$tmp0
     616# ifdef	MIPSEB
     617	sllv	$in0,$in0,$tmp0
     618	srlv	$tmp3,$in1,$tmp1
     619	sllv	$in1,$in1,$tmp0
     620	or	$in0,$in0,$tmp3
     621	srlv	$tmp3,$in2,$tmp1
     622	sllv	$in2,$in2,$tmp0
     623	or	$in1,$in1,$tmp3
     624	srlv	$tmp3,$in3,$tmp1
     625	sllv	$in3,$in3,$tmp0
     626	or	$in2,$in2,$tmp3
     627	srlv	$tmp2,$tmp2,$tmp1
     628	or	$in3,$in3,$tmp2
     629# else
     630	srlv	$in0,$in0,$tmp0
     631	sllv	$tmp3,$in1,$tmp1
     632	srlv	$in1,$in1,$tmp0
     633	or	$in0,$in0,$tmp3
     634	sllv	$tmp3,$in2,$tmp1
     635	srlv	$in2,$in2,$tmp0
     636	or	$in1,$in1,$tmp3
     637	sllv	$tmp3,$in3,$tmp1
     638	srlv	$in3,$in3,$tmp0
     639	or	$in2,$in2,$tmp3
     640	sllv	$tmp2,$tmp2,$tmp1
     641	or	$in3,$in3,$tmp2
     642# endif
     643.Laligned_key:
     644#else
     645	lwl	$in0,0+MSB($inp)
     646	lwl	$in1,4+MSB($inp)
     647	lwl	$in2,8+MSB($inp)
     648	lwl	$in3,12+MSB($inp)
     649	lwr	$in0,0+LSB($inp)
     650	lwr	$in1,4+LSB($inp)
     651	lwr	$in2,8+LSB($inp)
     652	lwr	$in3,12+LSB($inp)
     653#endif
     654#ifdef	MIPSEB
     655# if defined(_MIPS_ARCH_MIPS32R2)
     656	wsbh	$in0,$in0		# byte swap
     657	wsbh	$in1,$in1
     658	wsbh	$in2,$in2
     659	wsbh	$in3,$in3
     660	rotr	$in0,$in0,16
     661	rotr	$in1,$in1,16
     662	rotr	$in2,$in2,16
     663	rotr	$in3,$in3,16
     664# else
     665	srl	$tmp0,$in0,24		# byte swap
     666	srl	$tmp1,$in0,8
     667	andi	$tmp2,$in0,0xFF00
     668	sll	$in0,$in0,24
     669	andi	$tmp1,0xFF00
     670	sll	$tmp2,$tmp2,8
     671	or	$in0,$tmp0
     672	 srl	$tmp0,$in1,24
     673	or	$tmp1,$tmp2
     674	 srl	$tmp2,$in1,8
     675	or	$in0,$tmp1
     676	 andi	$tmp1,$in1,0xFF00
     677	 sll	$in1,$in1,24
     678	 andi	$tmp2,0xFF00
     679	 sll	$tmp1,$tmp1,8
     680	 or	$in1,$tmp0
     681	srl	$tmp0,$in2,24
     682	 or	$tmp2,$tmp1
     683	srl	$tmp1,$in2,8
     684	 or	$in1,$tmp2
     685	andi	$tmp2,$in2,0xFF00
     686	sll	$in2,$in2,24
     687	andi	$tmp1,0xFF00
     688	sll	$tmp2,$tmp2,8
     689	or	$in2,$tmp0
     690	 srl	$tmp0,$in3,24
     691	or	$tmp1,$tmp2
     692	 srl	$tmp2,$in3,8
     693	or	$in2,$tmp1
     694	 andi	$tmp1,$in3,0xFF00
     695	 sll	$in3,$in3,24
     696	 andi	$tmp2,0xFF00
     697	 sll	$tmp1,$tmp1,8
     698	 or	$in3,$tmp0
     699	 or	$tmp2,$tmp1
     700	 or	$in3,$tmp2
     701# endif
     702#endif
     703	lui	$tmp0,0x0fff
     704	ori	$tmp0,0xffff		# 0x0fffffff
     705	and	$in0,$in0,$tmp0
     706	subu	$tmp0,3			# 0x0ffffffc
     707	and	$in1,$in1,$tmp0
     708	and	$in2,$in2,$tmp0
     709	and	$in3,$in3,$tmp0
     710
     711	sw	$in0,20($ctx)
     712	sw	$in1,24($ctx)
     713	sw	$in2,28($ctx)
     714	sw	$in3,32($ctx)
     715
     716	srl	$tmp1,$in1,2
     717	srl	$tmp2,$in2,2
     718	srl	$tmp3,$in3,2
     719	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
     720	addu	$in2,$in2,$tmp2
     721	addu	$in3,$in3,$tmp3
     722	sw	$in1,36($ctx)
     723	sw	$in2,40($ctx)
     724	sw	$in3,44($ctx)
     725.Lno_key:
     726	li	$v0,0
     727	jr	$ra
     728.end	poly1305_init
     729___
     730{
# Callee-saved mask: NUBI saves $s0-$s11, other ABIs $s4-$s11 (the 32-bit
# loop keeps the whole hash and key in callee-saved registers).
     731my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
     732
# Five hash limbs, four key limbs r0..r3 and their precomputed
# sN = rN + (rN >> 2) counterparts, plus four input accumulators.
     733my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
     734   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
     735my ($d0,$d1,$d2,$d3) =
     736   ($a4,$a5,$a6,$a7);
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2
# ($shr and $one above are never live at the same time: $shr is only used
# in the R6 unaligned-load path, $one only in the R2 maddu path.)
     739
# Emit poly1305_blocks (32-bit). The multiply section has two variants
# selected by the preprocessor: MIPS32R2 (non-R6) uses the multu/maddu
# accumulator chain, everything else the multu()/mflo()/mfhi() macro
# form with explicit carry propagation via sltu.
     740$code.=<<___;
     741.globl	poly1305_blocks
     742.align	5
     743.ent	poly1305_blocks
     744poly1305_blocks:
     745	.frame	$sp,16*4,$ra
     746	.mask	$SAVED_REGS_MASK,-4
     747	.set	noreorder
     748	subu	$sp, $sp,4*12
     749	sw	$s11,4*11($sp)
     750	sw	$s10,4*10($sp)
     751	sw	$s9, 4*9($sp)
     752	sw	$s8, 4*8($sp)
     753	sw	$s7, 4*7($sp)
     754	sw	$s6, 4*6($sp)
     755	sw	$s5, 4*5($sp)
     756	sw	$s4, 4*4($sp)
     757___
     758$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
     759	sw	$s3, 4*3($sp)
     760	sw	$s2, 4*2($sp)
     761	sw	$s1, 4*1($sp)
     762	sw	$s0, 4*0($sp)
     763___
     764$code.=<<___;
     765	.set	reorder
     766
     767	srl	$len,4			# number of complete blocks
     768	li	$one,1
     769	beqz	$len,.Labort
     770
     771#if defined(_MIPS_ARCH_MIPS32R6)
     772	andi	$shr,$inp,3
     773	subu	$inp,$inp,$shr		# align $inp
     774	sll	$shr,$shr,3		# byte to bit offset
     775#endif
     776
     777	lw	$h0,0($ctx)		# load hash value
     778	lw	$h1,4($ctx)
     779	lw	$h2,8($ctx)
     780	lw	$h3,12($ctx)
     781	lw	$h4,16($ctx)
     782
     783	lw	$r0,20($ctx)		# load key
     784	lw	$r1,24($ctx)
     785	lw	$r2,28($ctx)
     786	lw	$r3,32($ctx)
     787	lw	$rs1,36($ctx)
     788	lw	$rs2,40($ctx)
     789	lw	$rs3,44($ctx)
     790
     791	sll	$len,4
     792	addu	$len,$len,$inp		# end of buffer
     793	b	.Loop
     794
     795.align	4
     796.Loop:
     797#if defined(_MIPS_ARCH_MIPS32R6)
     798	lw	$d0,0($inp)		# load input
     799	lw	$d1,4($inp)
     800	lw	$d2,8($inp)
     801	lw	$d3,12($inp)
     802	beqz	$shr,.Laligned_inp
     803
     804	lw	$t0,16($inp)
     805	subu	$t1,$zero,$shr
     806# ifdef	MIPSEB
     807	sllv	$d0,$d0,$shr
     808	srlv	$at,$d1,$t1
     809	sllv	$d1,$d1,$shr
     810	or	$d0,$d0,$at
     811	srlv	$at,$d2,$t1
     812	sllv	$d2,$d2,$shr
     813	or	$d1,$d1,$at
     814	srlv	$at,$d3,$t1
     815	sllv	$d3,$d3,$shr
     816	or	$d2,$d2,$at
     817	srlv	$t0,$t0,$t1
     818	or	$d3,$d3,$t0
     819# else
     820	srlv	$d0,$d0,$shr
     821	sllv	$at,$d1,$t1
     822	srlv	$d1,$d1,$shr
     823	or	$d0,$d0,$at
     824	sllv	$at,$d2,$t1
     825	srlv	$d2,$d2,$shr
     826	or	$d1,$d1,$at
     827	sllv	$at,$d3,$t1
     828	srlv	$d3,$d3,$shr
     829	or	$d2,$d2,$at
     830	sllv	$t0,$t0,$t1
     831	or	$d3,$d3,$t0
     832# endif
     833.Laligned_inp:
     834#else
     835	lwl	$d0,0+MSB($inp)		# load input
     836	lwl	$d1,4+MSB($inp)
     837	lwl	$d2,8+MSB($inp)
     838	lwl	$d3,12+MSB($inp)
     839	lwr	$d0,0+LSB($inp)
     840	lwr	$d1,4+LSB($inp)
     841	lwr	$d2,8+LSB($inp)
     842	lwr	$d3,12+LSB($inp)
     843#endif
     844#ifdef	MIPSEB
     845# if defined(_MIPS_ARCH_MIPS32R2)
     846	wsbh	$d0,$d0			# byte swap
     847	wsbh	$d1,$d1
     848	wsbh	$d2,$d2
     849	wsbh	$d3,$d3
     850	rotr	$d0,$d0,16
     851	rotr	$d1,$d1,16
     852	rotr	$d2,$d2,16
     853	rotr	$d3,$d3,16
     854# else
     855	srl	$at,$d0,24		# byte swap
     856	srl	$t0,$d0,8
     857	andi	$t1,$d0,0xFF00
     858	sll	$d0,$d0,24
     859	andi	$t0,0xFF00
     860	sll	$t1,$t1,8
     861	or	$d0,$at
     862	 srl	$at,$d1,24
     863	or	$t0,$t1
     864	 srl	$t1,$d1,8
     865	or	$d0,$t0
     866	 andi	$t0,$d1,0xFF00
     867	 sll	$d1,$d1,24
     868	 andi	$t1,0xFF00
     869	 sll	$t0,$t0,8
     870	 or	$d1,$at
     871	srl	$at,$d2,24
     872	 or	$t1,$t0
     873	srl	$t0,$d2,8
     874	 or	$d1,$t1
     875	andi	$t1,$d2,0xFF00
     876	sll	$d2,$d2,24
     877	andi	$t0,0xFF00
     878	sll	$t1,$t1,8
     879	or	$d2,$at
     880	 srl	$at,$d3,24
     881	or	$t0,$t1
     882	 srl	$t1,$d3,8
     883	or	$d2,$t0
     884	 andi	$t0,$d3,0xFF00
     885	 sll	$d3,$d3,24
     886	 andi	$t1,0xFF00
     887	 sll	$t0,$t0,8
     888	 or	$d3,$at
     889	 or	$t1,$t0
     890	 or	$d3,$t1
     891# endif
     892#endif
     893	srl	$t0,$h4,2		# modulo-scheduled reduction
     894	andi	$h4,$h4,3
     895	sll	$at,$t0,2
     896
     897	addu	$d0,$d0,$h0		# accumulate input
     898	 addu	$t0,$t0,$at
     899	sltu	$h0,$d0,$h0
     900	addu	$d0,$d0,$t0		# ... and residue
     901	sltu	$at,$d0,$t0
     902
     903	addu	$d1,$d1,$h1
     904	 addu	$h0,$h0,$at		# carry
     905	sltu	$h1,$d1,$h1
     906	addu	$d1,$d1,$h0
     907	sltu	$h0,$d1,$h0
     908
     909	addu	$d2,$d2,$h2
     910	 addu	$h1,$h1,$h0		# carry
     911	sltu	$h2,$d2,$h2
     912	addu	$d2,$d2,$h1
     913	sltu	$h1,$d2,$h1
     914
     915	addu	$d3,$d3,$h3
     916	 addu	$h2,$h2,$h1		# carry
     917	sltu	$h3,$d3,$h3
     918	addu	$d3,$d3,$h2
     919
     920#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
     921	multu	$r0,$d0			# d0*r0
     922	 sltu	$h2,$d3,$h2
     923	maddu	$rs3,$d1		# d1*s3
     924	 addu	$h3,$h3,$h2		# carry
     925	maddu	$rs2,$d2		# d2*s2
     926	 addu	$h4,$h4,$padbit
     927	maddu	$rs1,$d3		# d3*s1
     928	 addu	$h4,$h4,$h3
     929	mfhi	$at
     930	mflo	$h0
     931
     932	multu	$r1,$d0			# d0*r1
     933	maddu	$r0,$d1			# d1*r0
     934	maddu	$rs3,$d2		# d2*s3
     935	maddu	$rs2,$d3		# d3*s2
     936	maddu	$rs1,$h4		# h4*s1
     937	maddu	$at,$one		# hi*1
     938	mfhi	$at
     939	mflo	$h1
     940
     941	multu	$r2,$d0			# d0*r2
     942	maddu	$r1,$d1			# d1*r1
     943	maddu	$r0,$d2			# d2*r0
     944	maddu	$rs3,$d3		# d3*s3
     945	maddu	$rs2,$h4		# h4*s2
     946	maddu	$at,$one		# hi*1
     947	mfhi	$at
     948	mflo	$h2
     949
     950	mul	$t0,$r0,$h4		# h4*r0
     951
     952	multu	$r3,$d0			# d0*r3
     953	maddu	$r2,$d1			# d1*r2
     954	maddu	$r1,$d2			# d2*r1
     955	maddu	$r0,$d3			# d3*r0
     956	maddu	$rs3,$h4		# h4*s3
     957	maddu	$at,$one		# hi*1
     958	mfhi	$at
     959	mflo	$h3
     960
     961	 addiu	$inp,$inp,16
     962
     963	addu	$h4,$t0,$at
     964#else
     965	multu	($r0,$d0)		# d0*r0
     966	mflo	($h0,$r0,$d0)
     967	mfhi	($h1,$r0,$d0)
     968
     969	 sltu	$h2,$d3,$h2
     970	 addu	$h3,$h3,$h2		# carry
     971
     972	multu	($rs3,$d1)		# d1*s3
     973	mflo	($at,$rs3,$d1)
     974	mfhi	($t0,$rs3,$d1)
     975
     976	 addu	$h4,$h4,$padbit
     977	 addiu	$inp,$inp,16
     978	 addu	$h4,$h4,$h3
     979
     980	multu	($rs2,$d2)		# d2*s2
     981	mflo	($a3,$rs2,$d2)
     982	mfhi	($t1,$rs2,$d2)
     983	 addu	$h0,$h0,$at
     984	 addu	$h1,$h1,$t0
     985	multu	($rs1,$d3)		# d3*s1
     986	 sltu	$at,$h0,$at
     987	 addu	$h1,$h1,$at
     988
     989	mflo	($at,$rs1,$d3)
     990	mfhi	($t0,$rs1,$d3)
     991	 addu	$h0,$h0,$a3
     992	 addu	$h1,$h1,$t1
     993	multu	($r1,$d0)		# d0*r1
     994	 sltu	$a3,$h0,$a3
     995	 addu	$h1,$h1,$a3
     996
     997
     998	mflo	($a3,$r1,$d0)
     999	mfhi	($h2,$r1,$d0)
    1000	 addu	$h0,$h0,$at
    1001	 addu	$h1,$h1,$t0
    1002	multu	($r0,$d1)		# d1*r0
    1003	 sltu	$at,$h0,$at
    1004	 addu	$h1,$h1,$at
    1005
    1006	mflo	($at,$r0,$d1)
    1007	mfhi	($t0,$r0,$d1)
    1008	 addu	$h1,$h1,$a3
    1009	 sltu	$a3,$h1,$a3
    1010	multu	($rs3,$d2)		# d2*s3
    1011	 addu	$h2,$h2,$a3
    1012
    1013	mflo	($a3,$rs3,$d2)
    1014	mfhi	($t1,$rs3,$d2)
    1015	 addu	$h1,$h1,$at
    1016	 addu	$h2,$h2,$t0
    1017	multu	($rs2,$d3)		# d3*s2
    1018	 sltu	$at,$h1,$at
    1019	 addu	$h2,$h2,$at
    1020
    1021	mflo	($at,$rs2,$d3)
    1022	mfhi	($t0,$rs2,$d3)
    1023	 addu	$h1,$h1,$a3
    1024	 addu	$h2,$h2,$t1
    1025	multu	($rs1,$h4)		# h4*s1
    1026	 sltu	$a3,$h1,$a3
    1027	 addu	$h2,$h2,$a3
    1028
    1029	mflo	($a3,$rs1,$h4)
    1030	 addu	$h1,$h1,$at
    1031	 addu	$h2,$h2,$t0
    1032	multu	($r2,$d0)		# d0*r2
    1033	 sltu	$at,$h1,$at
    1034	 addu	$h2,$h2,$at
    1035
    1036
    1037	mflo	($at,$r2,$d0)
    1038	mfhi	($h3,$r2,$d0)
    1039	 addu	$h1,$h1,$a3
    1040	 sltu	$a3,$h1,$a3
    1041	multu	($r1,$d1)		# d1*r1
    1042	 addu	$h2,$h2,$a3
    1043
    1044	mflo	($a3,$r1,$d1)
    1045	mfhi	($t1,$r1,$d1)
    1046	 addu	$h2,$h2,$at
    1047	 sltu	$at,$h2,$at
    1048	multu	($r0,$d2)		# d2*r0
    1049	 addu	$h3,$h3,$at
    1050
    1051	mflo	($at,$r0,$d2)
    1052	mfhi	($t0,$r0,$d2)
    1053	 addu	$h2,$h2,$a3
    1054	 addu	$h3,$h3,$t1
    1055	multu	($rs3,$d3)		# d3*s3
    1056	 sltu	$a3,$h2,$a3
    1057	 addu	$h3,$h3,$a3
    1058
    1059	mflo	($a3,$rs3,$d3)
    1060	mfhi	($t1,$rs3,$d3)
    1061	 addu	$h2,$h2,$at
    1062	 addu	$h3,$h3,$t0
    1063	multu	($rs2,$h4)		# h4*s2
    1064	 sltu	$at,$h2,$at
    1065	 addu	$h3,$h3,$at
    1066
    1067	mflo	($at,$rs2,$h4)
    1068	 addu	$h2,$h2,$a3
    1069	 addu	$h3,$h3,$t1
    1070	multu	($r3,$d0)		# d0*r3
    1071	 sltu	$a3,$h2,$a3
    1072	 addu	$h3,$h3,$a3
    1073
    1074
    1075	mflo	($a3,$r3,$d0)
    1076	mfhi	($t1,$r3,$d0)
    1077	 addu	$h2,$h2,$at
    1078	 sltu	$at,$h2,$at
    1079	multu	($r2,$d1)		# d1*r2
    1080	 addu	$h3,$h3,$at
    1081
    1082	mflo	($at,$r2,$d1)
    1083	mfhi	($t0,$r2,$d1)
    1084	 addu	$h3,$h3,$a3
    1085	 sltu	$a3,$h3,$a3
    1086	multu	($r0,$d3)		# d3*r0
    1087	 addu	$t1,$t1,$a3
    1088
    1089	mflo	($a3,$r0,$d3)
    1090	mfhi	($d3,$r0,$d3)
    1091	 addu	$h3,$h3,$at
    1092	 addu	$t1,$t1,$t0
    1093	multu	($r1,$d2)		# d2*r1
    1094	 sltu	$at,$h3,$at
    1095	 addu	$t1,$t1,$at
    1096
    1097	mflo	($at,$r1,$d2)
    1098	mfhi	($t0,$r1,$d2)
    1099	 addu	$h3,$h3,$a3
    1100	 addu	$t1,$t1,$d3
    1101	multu	($rs3,$h4)		# h4*s3
    1102	 sltu	$a3,$h3,$a3
    1103	 addu	$t1,$t1,$a3
    1104
    1105	mflo	($a3,$rs3,$h4)
    1106	 addu	$h3,$h3,$at
    1107	 addu	$t1,$t1,$t0
    1108	multu	($r0,$h4)		# h4*r0
    1109	 sltu	$at,$h3,$at
    1110	 addu	$t1,$t1,$at
    1111
    1112
    1113	mflo	($h4,$r0,$h4)
    1114	 addu	$h3,$h3,$a3
    1115	 sltu	$a3,$h3,$a3
    1116	 addu	$t1,$t1,$a3
    1117	addu	$h4,$h4,$t1
    1118
    1119	li	$padbit,1		# if we loop, padbit is 1
    1120#endif
    1121	bne	$inp,$len,.Loop
    1122
    1123	sw	$h0,0($ctx)		# store hash value
    1124	sw	$h1,4($ctx)
    1125	sw	$h2,8($ctx)
    1126	sw	$h3,12($ctx)
    1127	sw	$h4,16($ctx)
    1128
    1129	.set	noreorder
    1130.Labort:
    1131	lw	$s11,4*11($sp)
    1132	lw	$s10,4*10($sp)
    1133	lw	$s9, 4*9($sp)
    1134	lw	$s8, 4*8($sp)
    1135	lw	$s7, 4*7($sp)
    1136	lw	$s6, 4*6($sp)
    1137	lw	$s5, 4*5($sp)
    1138	lw	$s4, 4*4($sp)
    1139___
    1140$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
    1141	lw	$s3, 4*3($sp)
    1142	lw	$s2, 4*2($sp)
    1143	lw	$s1, 4*1($sp)
    1144	lw	$s0, 4*0($sp)
    1145___
    1146$code.=<<___;
    1147	jr	$ra
    1148	addu	$sp,$sp,4*12
    1149.end	poly1305_blocks
    1150___
    1151}
    1152{
# Emit poly1305_emit (32-bit): final reduction mod 2^130-5 over five
# 32-bit limbs, branchless h vs. h-p selection (xor/and/xor with the
# carry-derived mask), nonce accumulation, and byte-wise tag store.
# Note: after the state loads, the $ctx register itself is reused as a
# carry/scratch register (visible from "srl $ctx,$tmp4,2" onward).
    1153my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
    1154
    1155$code.=<<___;
    1156.align	5
    1157.globl	poly1305_emit
    1158.ent	poly1305_emit
    1159poly1305_emit:
    1160	.frame	$sp,0,$ra
    1161	.set	reorder
    1162
    1163	lw	$tmp4,16($ctx)
    1164	lw	$tmp0,0($ctx)
    1165	lw	$tmp1,4($ctx)
    1166	lw	$tmp2,8($ctx)
    1167	lw	$tmp3,12($ctx)
    1168
    1169	li	$in0,-4			# final reduction
    1170	srl	$ctx,$tmp4,2
    1171	and	$in0,$in0,$tmp4
    1172	andi	$tmp4,$tmp4,3
    1173	addu	$ctx,$ctx,$in0
    1174
    1175	addu	$tmp0,$tmp0,$ctx
    1176	sltu	$ctx,$tmp0,$ctx
    1177	 addiu	$in0,$tmp0,5		# compare to modulus
    1178	addu	$tmp1,$tmp1,$ctx
    1179	 sltiu	$in1,$in0,5
    1180	sltu	$ctx,$tmp1,$ctx
    1181	 addu	$in1,$in1,$tmp1
    1182	addu	$tmp2,$tmp2,$ctx
    1183	 sltu	$in2,$in1,$tmp1
    1184	sltu	$ctx,$tmp2,$ctx
    1185	 addu	$in2,$in2,$tmp2
    1186	addu	$tmp3,$tmp3,$ctx
    1187	 sltu	$in3,$in2,$tmp2
    1188	sltu	$ctx,$tmp3,$ctx
    1189	 addu	$in3,$in3,$tmp3
    1190	addu	$tmp4,$tmp4,$ctx
    1191	 sltu	$ctx,$in3,$tmp3
    1192	 addu	$ctx,$tmp4
    1193
    1194	srl	$ctx,2			# see if it carried/borrowed
    1195	subu	$ctx,$zero,$ctx
    1196
    1197	xor	$in0,$tmp0
    1198	xor	$in1,$tmp1
    1199	xor	$in2,$tmp2
    1200	xor	$in3,$tmp3
    1201	and	$in0,$ctx
    1202	and	$in1,$ctx
    1203	and	$in2,$ctx
    1204	and	$in3,$ctx
    1205	xor	$in0,$tmp0
    1206	xor	$in1,$tmp1
    1207	xor	$in2,$tmp2
    1208	xor	$in3,$tmp3
    1209
    1210	lw	$tmp0,0($nonce)		# load nonce
    1211	lw	$tmp1,4($nonce)
    1212	lw	$tmp2,8($nonce)
    1213	lw	$tmp3,12($nonce)
    1214
    1215	addu	$in0,$tmp0		# accumulate nonce
    1216	sltu	$ctx,$in0,$tmp0
    1217
    1218	addu	$in1,$tmp1
    1219	sltu	$tmp1,$in1,$tmp1
    1220	addu	$in1,$ctx
    1221	sltu	$ctx,$in1,$ctx
    1222	addu	$ctx,$tmp1
    1223
    1224	addu	$in2,$tmp2
    1225	sltu	$tmp2,$in2,$tmp2
    1226	addu	$in2,$ctx
    1227	sltu	$ctx,$in2,$ctx
    1228	addu	$ctx,$tmp2
    1229
    1230	addu	$in3,$tmp3
    1231	addu	$in3,$ctx
    1232
    1233	srl	$tmp0,$in0,8		# write mac value
    1234	srl	$tmp1,$in0,16
    1235	srl	$tmp2,$in0,24
    1236	sb	$in0, 0($mac)
    1237	sb	$tmp0,1($mac)
    1238	srl	$tmp0,$in1,8
    1239	sb	$tmp1,2($mac)
    1240	srl	$tmp1,$in1,16
    1241	sb	$tmp2,3($mac)
    1242	srl	$tmp2,$in1,24
    1243	sb	$in1, 4($mac)
    1244	sb	$tmp0,5($mac)
    1245	srl	$tmp0,$in2,8
    1246	sb	$tmp1,6($mac)
    1247	srl	$tmp1,$in2,16
    1248	sb	$tmp2,7($mac)
    1249	srl	$tmp2,$in2,24
    1250	sb	$in2, 8($mac)
    1251	sb	$tmp0,9($mac)
    1252	srl	$tmp0,$in3,8
    1253	sb	$tmp1,10($mac)
    1254	srl	$tmp1,$in3,16
    1255	sb	$tmp2,11($mac)
    1256	srl	$tmp2,$in3,24
    1257	sb	$in3, 12($mac)
    1258	sb	$tmp0,13($mac)
    1259	sb	$tmp1,14($mac)
    1260	sb	$tmp2,15($mac)
    1261
    1262	jr	$ra
    1263.end	poly1305_emit
    1264.rdata
    1265.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
    1266.align	2
    1267___
    1268}
    1269}}}
   1270
# Write the generated assembly to the file named by the last command-line
# argument, falling back to stdout when no output file is given.
# NOTE(review): this is a two-arg open with an interpolated filename and
# no error check — a name beginning with '>' or '|' would change the open
# mode, and a failed open is silently ignored. CRYPTOGAMS upstream keeps
# this form; a 3-arg open with "or die" would be safer — TODO confirm
# against upstream before diverging.
    1271$output=pop and open STDOUT,">$output";
    1272print $code;
    1273close STDOUT;