cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sha256-armv4.pl (18403B)


#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
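
# These arrays hold the rotate/shift amounts of the FIPS 180-4 SHA-256
# functions:
#   Sigma0(x) = (x ror 2)  ^ (x ror 13) ^ (x ror 22)
#   Sigma1(x) = (x ror 6)  ^ (x ror 11) ^ (x ror 25)
#   sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#   sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
# The round code folds each triple into two eor-with-rotated-operand
# instructions plus one rotation applied when the result is consumed,
# e.g. ((e ^ (e ror 5) ^ (e ror 19)) ror 6) == Sigma1(e).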

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
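
# Each call emits one scalar round: h += X[i] + K256[i] + Sigma1(e) +
# Ch(e,f,g), then d += h and h += Sigma0(a). The final "h += Maj(a,b,c)"
# (commented out above) is deferred: $t2/$t3 are swapped on exit, and the
# next round retires it via "add $a,$a,$t2 ... from the past", which is
# also why the loop epilogue performs one last such add.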

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
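
# Rounds 16..63 extend the message schedule in the 16-word circular
# buffer on the stack,
#   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16],
# (the loads at (i+1)%16, (i+14)%16, (i+9)%16 and (i+0)%16 above are
# exactly those four words), then fall through to BODY_00_15.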

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code   32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
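
# The AUTOLOAD thunk turns any undefined sub call into an emitted
# instruction: the sub name becomes the mnemonic ("_" -> "."), and a
# purely numeric final argument is prefixed with "#". For example
# &vshr_u32($T2,$T0,$sigma0[0]) appends "vshr.u32 q10,q8,#7" to $code.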

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
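
# Xupdate() advances the message schedule by four words while feeding the
# interleaved instruction fragments of four scalar rounds (from $body) to
# the integer pipeline, so the NEON and ARM units run concurrently.
# Xpreload() below plays the same trick for rounds that still consume the
# initial sixteen words, where only a byte-swap and the K256 add are needed.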

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,.Lsha256_block_data_order
	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
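
# The twelve schedule-updating iterations above plus the four quad-round
# groups below make 16 groups in total; each sha256h/sha256h2 pair retires
# four rounds, i.e. 64 rounds per 64-byte block.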
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
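
# unsha256() hand-assembles the crypto-extension mnemonics for assemblers
# that predate them: the three q-register numbers are packed into the
# 32-bit NEON encoding (low 3 bits into the Vd/Vn/Vm fields, bit 3 into
# D/N/M) and emitted byte-wise via the INST macro defined above. For
# example "sha256h q0,q1,q12" encodes to word 0xf3020c68, i.e.
# INST(0x68,0x0c,0x02,0xf3).

# The loop below post-processes $code: it evaluates the `...` constant
# expressions, rewrites sha256* mnemonics into INST() byte sequences via
# unsha256(), and degrades "ret" to "bx lr" and pre-existing "bx lr" to
# its raw .word encoding so the output still assembles with -march=armv4.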

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush