cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sha1-armv7-neon.S (20509B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4

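/*
 * K1..K4 are the standard SHA-1 round constants, floor(2^30 * sqrt(n))
 * for n = 2, 3, 5, 10.  A minimal C sketch (illustrative, not from this
 * file) that recomputes them:
 *
 *	#include <math.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		const double n[4] = { 2.0, 3.0, 5.0, 10.0 };
 *
 *		for (int i = 0; i < 4; i++)	// prints K1..K4
 *			printf("K%d = 0x%08X\n", i + 1,
 *			       (uint32_t)ldexp(sqrt(n[i]), 30));
 *		return 0;
 *	}
 *
 * Each constant is replicated into all four 32-bit lanes of a q register
 * (.LK1-.LK4 above) so a single vadd.u32 can add it to four message words
 * at once.
 */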

/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)		code
#endif
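
/*
 * ARM_LE() emits its argument only on little-endian kernels: SHA-1 message
 * words are big-endian, so the vrev32.8 byte swaps below are needed on LE
 * CPUs and disappear entirely when CONFIG_CPU_BIG_ENDIAN is set.
 */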

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)

#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0; \

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define dummy(...)

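/*
 * For reference, a scalar C sketch of the round skeleton that the macros
 * above interleave with NEON precalc steps (names here are illustrative,
 * not from this file).  F1 is "choose", F3 is "majority", F2 and F4 are
 * parity; _R_F4 simply reuses _R_F2 because the boolean function is the
 * same:
 *
 *	#include <stdint.h>
 *
 *	static inline uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	static uint32_t f1(uint32_t b, uint32_t c, uint32_t d)
 *	{
 *		return (b & c) | (~b & d);	// bic/and pair in _R_F1
 *	}
 *
 *	static uint32_t f2(uint32_t b, uint32_t c, uint32_t d)
 *	{
 *		return b ^ c ^ d;		// eor/eor pair in _R_F2
 *	}
 *
 *	static uint32_t f3(uint32_t b, uint32_t c, uint32_t d)
 *	{
 *		// computed as (b & c) + ((b ^ c) & d) in _R_F3; the two
 *		// terms never share a set bit, so add equals or
 *		return (b & c) | (b & d) | (c & d);
 *	}
 *
 *	// One round; wk is W[i] + K, already parked on the stack by the
 *	// precalc macros and fetched through WK_offs(i).
 *	static void sha1_round(uint32_t a, uint32_t *b, uint32_t c,
 *			       uint32_t d, uint32_t *e, uint32_t f,
 *			       uint32_t wk)
 *	{
 *		*e += rol32(a, 5) + f + wk;
 *		*b = rol32(*b, 30);
 *	}
 *
 * The "ror #(32 - 5)" and "ror #(32 - 30)" forms in the macros are these
 * left rotations written as right rotations, since ARM only has ror.
 */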

/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

#define W_PRECALC_00_15() \
	add       RWK, sp, #(WK_offs(0));			\
	\
	vld1.32   {W0, W7}, [RDATA]!;				\
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\
	vld1.32   {W6, W5}, [RDATA]!;				\
	vadd.u32  tmp0, W0, curK;				\
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\
	vadd.u32  tmp1, W7, curK;				\
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\
	vadd.u32  tmp2, W6, curK;				\
	vst1.32   {tmp0, tmp1}, [RWK]!;				\
	vadd.u32  tmp3, W5, curK;				\
	vst1.32   {tmp2, tmp3}, [RWK];				\

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W0, W7}, [RDATA]!;				\

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(0));			\

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W6, W5}, [RDATA]!;				\

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W0, curK;				\

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp1, W7, curK;				\

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp2, W6, curK;				\

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0, tmp1}, [RWK]!;				\

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp3, W5, curK;				\

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp2, tmp3}, [RWK];				\

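/*
 * In C terms the 00-15 precalc amounts to (illustrative sketch; wk[] is
 * the 16-word stack area addressed through RWK, be32_to_cpu() the usual
 * kernel byte-swap helper; rounds 0-15 all use K1):
 *
 *	for (int i = 0; i < 16; i++)
 *		wk[i] = be32_to_cpu(data[i]) + K1;
 *
 * The same 16 stack slots are reused modulo 16 for later rounds, which is
 * why WK_offs() masks the index with "& 15".  The WPRECALC_00_15_0..12
 * single-step macros above are the same work as W_PRECALC_00_15(), sliced
 * up so it can be interleaved with rounds 64-79 of the previous block in
 * the main loop below.
 */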

/********* Precalc macros for rounds 16-31 ************************************/

#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0;			\
	vext.8    W, W_m16, W_m12, #8;		\

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i));	\
	vext.8    tmp0, W_m04, tmp0, #4;	\

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W_m16;		\
	veor.32   W, W, W_m08;			\

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp1, tmp1;			\
	veor      W, W, tmp0;			\

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp0, W, #1;			\

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp1, tmp1, W, #(16-12);	\
	vshr.u32  W, W, #31;			\

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      tmp0, tmp0, W;		\
	vshr.u32  W, tmp1, #30;			\

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, tmp1, #2;		\

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W;		\

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0, tmp1;		\

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK;		\

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];

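/*
 * These steps vectorize the standard SHA-1 message expansion for rounds
 * 16-31, which in scalar C is (illustrative sketch):
 *
 *	for (int i = 16; i < 32; i++)
 *		w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 * Four words are computed per vector, but lane 3 needs w[i-3], which lives
 * in the same vector and is not ready yet.  So lane 3 is computed with
 * that term zeroed (the vext from a zeroed tmp0), everything is rotated
 * left by 1, and lane 3 is then patched with rol32(w[i], 1), obtained by
 * rotating the un-rotated lane-0 value by 2 (the tmp1 shl #2 / shr #30
 * pair); this works because rotation distributes over xor.
 */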

/********* Precalc macros for rounds 32-79 ************************************/

#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];

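/*
 * For rounds 32-79 the expansion recurrence is unrolled once, which
 * removes the w[i-3] dependency so all four lanes can be computed in one
 * shot.  In scalar C (illustrative sketch, equivalent to the standard
 * expansion):
 *
 *	for (int i = 32; i < 80; i++)
 *		w[i] = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 *
 * Eight q registers (W0-W7) hold the last 32 expanded words; W is also
 * the w[i-32] input, so each vector is overwritten in place, and the
 * W operand lists in the main loop below shift by one register every
 * four rounds.
 */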

/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 */
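/*
 * A sketch of how C glue code might drive this (the wrapper shown here is
 * an assumption, not part of this file; kernel_neon_begin/end are the
 * usual arm helpers for using NEON in kernel mode):
 *
 *	unsigned int sha1_transform_neon(void *ctx,
 *					 const unsigned char *data,
 *					 unsigned int nblks);
 *
 *	kernel_neon_begin();	// NEON use in kernel mode must be bracketed
 *	sha1_transform_neon(state, data, nblks);
 *	kernel_neon_end();
 *
 * ctx is read and written as five 32-bit words at the offsets
 * state_h0..state_h4 defined above.
 */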
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *	r0: ctx, CTX
   *	r1: data (64*nblks bytes)
   *	r2: nblks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;

  /* Align stack: reserve 16*4 bytes for the W+K schedule and round
   * sp down to a 16-byte boundary. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
  /* Precalc 0-15. */
  W_PRECALC_00_15();

.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  subs RNBLKS, #1;

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

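  /*
   * Davies-Meyer feed-forward, in C terms (illustrative):
   *
   *	for (int i = 0; i < 5; i++)
   *		state[i] += abcde[i];
   *
   * Only four scratch registers are free here, so h4 is fetched into RT0
   * separately, after _a has consumed the first load.
   */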
  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  mov sp, ROLDSTACK;

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  pop {r4-r12, pc};

.Ldo_nothing:
  bx lr
ENDPROC(sha1_transform_neon)