cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

chacha-s390.S (13354B)


      1/* SPDX-License-Identifier: GPL-2.0 */
      2/*
      3 * Original implementation written by Andy Polyakov, @dot-asm.
      4 * This is an adaptation of the original code for kernel use.
      5 *
      6 * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
      7 */
      8
      9#include <linux/linkage.h>
     10#include <asm/nospec-insn.h>
     11#include <asm/vx-insn.h>
     12
     13#define SP	%r15	/* stack pointer register */
     14#define FRAME	(16 * 8 + 4 * 8)	/* 160 bytes: size of the stack frame allocated by chacha20_vx */
     15
     16.data
     17.align	32
     18
     19.Lsigma:	# constant table; both routines address it via larl %r7,.Lsigma
     20.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# 0x00: sigma words, endian-neutral
     21.long	1,0,0,0	# 0x10: adds 1 to element 0 (loaded into T1 by chacha20_vx)
     22.long	2,0,0,0	# 0x20: adds 2 to element 0 (loaded into T2 by chacha20_vx)
     23.long	3,0,0,0	# 0x30: adds 3 to element 0 (loaded into T3 by chacha20_vx)
     24.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# 0x40: VPERM mask, byte swap within each 32-bit word
     25
     26.long	0,1,2,3	# 0x50: per-lane counter offsets (CTR in chacha20_vx_4x)
     27.long	0x61707865,0x61707865,0x61707865,0x61707865	# 0x60: smashed sigma, word 0 in all four lanes
     28.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e	# 0x70: sigma word 1 in all four lanes
     29.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32	# 0x80: sigma word 2 in all four lanes
     30.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574	# 0x90: sigma word 3 in all four lanes
     31
     32.previous
     33
     34	GEN_BR_THUNK %r14	# NOTE(review): presumably emits the thunk that BR_EX %r14 goes through; macro comes from asm/nospec-insn.h
     35
     36.text
     37
     38#############################################################################
     39# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
     40#		      const u32 *key, const u32 *counter)
     41
     42#define	OUT		%r2	/* argument 1: out */
     43#define	INP		%r3	/* argument 2: inp */
     44#define	LEN		%r4	/* argument 3: len, remaining byte count */
     45#define	KEY		%r5	/* argument 4: key; reused as a scratch byte register in the tail loop */
     46#define	COUNTER		%r6	/* argument 5: counter; saved/restored by the routine, reused as scratch in the tail loop */
     47
     48#define BEPERM		%v31	/* byte-swap permute mask from .Lsigma+0x40 */
     49#define CTR		%v26	/* lane offsets 0,1,2,3 from .Lsigma+0x50 */
     50
     51#define K0		%v16	/* sigma row (.Lsigma+0x00) */
     52#define K1		%v17	/* key words 0-3 */
     53#define K2		%v18	/* key words 4-7 */
     54#define K3		%v19	/* 16 bytes loaded from the counter pointer */
     55
     56#define XA0		%v0	/* XA0..XA3: state words 0-3, one block per 32-bit lane */
     57#define XA1		%v1
     58#define XA2		%v2
     59#define XA3		%v3
     60
     61#define XB0		%v4	/* XB0..XB3: state words 4-7 (key words 0-3 replicated) */
     62#define XB1		%v5
     63#define XB2		%v6
     64#define XB3		%v7
     65
     66#define XC0		%v8	/* XC0..XC3: state words 8-11 (key words 4-7 replicated) */
     67#define XC1		%v9
     68#define XC2		%v10
     69#define XC3		%v11
     70
     71#define XD0		%v12	/* XD0..XD3: state words 12-15 (counter block words replicated) */
     72#define XD1		%v13
     73#define XD2		%v14
     74#define XD3		%v15
     75
     76#define XT0		%v27	/* XT0..XT3: temporaries for the transpose, the input data and the tail spill */
     77#define XT1		%v28
     78#define XT2		%v29
     79#define XT3		%v30
     80
     81ENTRY(chacha20_vx_4x)	# computes four keystream blocks at once (one per 32-bit lane) and xors up to 256 bytes
     82	stmg	%r6,%r7,6*8(SP)	# save r6/r7 at 48(SP); this routine does not allocate a frame of its own
     83
     84	larl	%r7,.Lsigma	# r7 = base of the constant table
     85	lhi	%r0,10	# 10 loop iterations, each one column round plus one diagonal round
     86	lhi	%r1,0	# r1 is set to 0 again (lghi) before the tail loop uses it
     87
     88	VL	K0,0,,%r7		# load sigma
     89	VL	K1,0,,KEY		# load key
     90	VL	K2,16,,KEY
     91	VL	K3,0,,COUNTER		# load counter
     92
     93	VL	BEPERM,0x40,,%r7	# byte-swap permute mask
     94	VL	CTR,0x50,,%r7	# lane offsets 0,1,2,3
     95
     96	VLM	XA0,XA3,0x60,%r7,4	# load [smashed] sigma
     97
     98	VREPF	XB0,K1,0		# smash the key
     99	VREPF	XB1,K1,1
    100	VREPF	XB2,K1,2
    101	VREPF	XB3,K1,3
    102
    103	VREPF	XD0,K3,0	# smash the counter block
    104	VREPF	XD1,K3,1
    105	VREPF	XD2,K3,2
    106	VREPF	XD3,K3,3
    107	VAF	XD0,XD0,CTR	# lane n gets counter word 0 plus n
    108
    109	VREPF	XC0,K2,0	# smash key words 4-7
    110	VREPF	XC1,K2,1
    111	VREPF	XC2,K2,2
    112	VREPF	XC3,K2,3
    113
    114.Loop_4x:
    115	VAF	XA0,XA0,XB0	# column round, step 1: a += b; d ^= a; d <<<= 16
    116	VX	XD0,XD0,XA0
    117	VERLLF	XD0,XD0,16
    118
    119	VAF	XA1,XA1,XB1
    120	VX	XD1,XD1,XA1
    121	VERLLF	XD1,XD1,16
    122
    123	VAF	XA2,XA2,XB2
    124	VX	XD2,XD2,XA2
    125	VERLLF	XD2,XD2,16
    126
    127	VAF	XA3,XA3,XB3
    128	VX	XD3,XD3,XA3
    129	VERLLF	XD3,XD3,16
    130
    131	VAF	XC0,XC0,XD0	# column round, step 2: c += d; b ^= c; b <<<= 12
    132	VX	XB0,XB0,XC0
    133	VERLLF	XB0,XB0,12
    134
    135	VAF	XC1,XC1,XD1
    136	VX	XB1,XB1,XC1
    137	VERLLF	XB1,XB1,12
    138
    139	VAF	XC2,XC2,XD2
    140	VX	XB2,XB2,XC2
    141	VERLLF	XB2,XB2,12
    142
    143	VAF	XC3,XC3,XD3
    144	VX	XB3,XB3,XC3
    145	VERLLF	XB3,XB3,12
    146
    147	VAF	XA0,XA0,XB0	# column round, step 3: a += b; d ^= a; d <<<= 8
    148	VX	XD0,XD0,XA0
    149	VERLLF	XD0,XD0,8
    150
    151	VAF	XA1,XA1,XB1
    152	VX	XD1,XD1,XA1
    153	VERLLF	XD1,XD1,8
    154
    155	VAF	XA2,XA2,XB2
    156	VX	XD2,XD2,XA2
    157	VERLLF	XD2,XD2,8
    158
    159	VAF	XA3,XA3,XB3
    160	VX	XD3,XD3,XA3
    161	VERLLF	XD3,XD3,8
    162
    163	VAF	XC0,XC0,XD0	# column round, step 4: c += d; b ^= c; b <<<= 7
    164	VX	XB0,XB0,XC0
    165	VERLLF	XB0,XB0,7
    166
    167	VAF	XC1,XC1,XD1
    168	VX	XB1,XB1,XC1
    169	VERLLF	XB1,XB1,7
    170
    171	VAF	XC2,XC2,XD2
    172	VX	XB2,XB2,XC2
    173	VERLLF	XB2,XB2,7
    174
    175	VAF	XC3,XC3,XD3
    176	VX	XB3,XB3,XC3
    177	VERLLF	XB3,XB3,7
    178
    179	VAF	XA0,XA0,XB1	# diagonal round on (A0,B1,C2,D3) (A1,B2,C3,D0) (A2,B3,C0,D1) (A3,B0,C1,D2), step 1: rotate by 16
    180	VX	XD3,XD3,XA0
    181	VERLLF	XD3,XD3,16
    182
    183	VAF	XA1,XA1,XB2
    184	VX	XD0,XD0,XA1
    185	VERLLF	XD0,XD0,16
    186
    187	VAF	XA2,XA2,XB3
    188	VX	XD1,XD1,XA2
    189	VERLLF	XD1,XD1,16
    190
    191	VAF	XA3,XA3,XB0
    192	VX	XD2,XD2,XA3
    193	VERLLF	XD2,XD2,16
    194
    195	VAF	XC2,XC2,XD3	# diagonal round, step 2: c += d; b ^= c; b <<<= 12
    196	VX	XB1,XB1,XC2
    197	VERLLF	XB1,XB1,12
    198
    199	VAF	XC3,XC3,XD0
    200	VX	XB2,XB2,XC3
    201	VERLLF	XB2,XB2,12
    202
    203	VAF	XC0,XC0,XD1
    204	VX	XB3,XB3,XC0
    205	VERLLF	XB3,XB3,12
    206
    207	VAF	XC1,XC1,XD2
    208	VX	XB0,XB0,XC1
    209	VERLLF	XB0,XB0,12
    210
    211	VAF	XA0,XA0,XB1	# diagonal round, step 3: a += b; d ^= a; d <<<= 8
    212	VX	XD3,XD3,XA0
    213	VERLLF	XD3,XD3,8
    214
    215	VAF	XA1,XA1,XB2
    216	VX	XD0,XD0,XA1
    217	VERLLF	XD0,XD0,8
    218
    219	VAF	XA2,XA2,XB3
    220	VX	XD1,XD1,XA2
    221	VERLLF	XD1,XD1,8
    222
    223	VAF	XA3,XA3,XB0
    224	VX	XD2,XD2,XA3
    225	VERLLF	XD2,XD2,8
    226
    227	VAF	XC2,XC2,XD3	# diagonal round, step 4: c += d; b ^= c; b <<<= 7
    228	VX	XB1,XB1,XC2
    229	VERLLF	XB1,XB1,7
    230
    231	VAF	XC3,XC3,XD0
    232	VX	XB2,XB2,XC3
    233	VERLLF	XB2,XB2,7
    234
    235	VAF	XC0,XC0,XD1
    236	VX	XB3,XB3,XC0
    237	VERLLF	XB3,XB3,7
    238
    239	VAF	XC1,XC1,XD2
    240	VX	XB0,XB0,XC1
    241	VERLLF	XB0,XB0,7
    242	brct	%r0,.Loop_4x	# decrement r0, repeat until it reaches 0
    243
    244	VAF	XD0,XD0,CTR	# add the lane offsets again; together with the K3 additions below block n gets counter word 0 plus n
    245
    246	VMRHF	XT0,XA0,XA1		# transpose data
    247	VMRHF	XT1,XA2,XA3
    248	VMRLF	XT2,XA0,XA1
    249	VMRLF	XT3,XA2,XA3
    250	VPDI	XA0,XT0,XT1,0b0000	# after the transpose XA0..XA3 hold words 0-3 of blocks 0..3
    251	VPDI	XA1,XT0,XT1,0b0101
    252	VPDI	XA2,XT2,XT3,0b0000
    253	VPDI	XA3,XT2,XT3,0b0101
    254
    255	VMRHF	XT0,XB0,XB1	# same transpose for words 4-7
    256	VMRHF	XT1,XB2,XB3
    257	VMRLF	XT2,XB0,XB1
    258	VMRLF	XT3,XB2,XB3
    259	VPDI	XB0,XT0,XT1,0b0000
    260	VPDI	XB1,XT0,XT1,0b0101
    261	VPDI	XB2,XT2,XT3,0b0000
    262	VPDI	XB3,XT2,XT3,0b0101
    263
    264	VMRHF	XT0,XC0,XC1	# same transpose for words 8-11
    265	VMRHF	XT1,XC2,XC3
    266	VMRLF	XT2,XC0,XC1
    267	VMRLF	XT3,XC2,XC3
    268	VPDI	XC0,XT0,XT1,0b0000
    269	VPDI	XC1,XT0,XT1,0b0101
    270	VPDI	XC2,XT2,XT3,0b0000
    271	VPDI	XC3,XT2,XT3,0b0101
    272
    273	VMRHF	XT0,XD0,XD1	# same transpose for words 12-15
    274	VMRHF	XT1,XD2,XD3
    275	VMRLF	XT2,XD0,XD1
    276	VMRLF	XT3,XD2,XD3
    277	VPDI	XD0,XT0,XT1,0b0000
    278	VPDI	XD1,XT0,XT1,0b0101
    279	VPDI	XD2,XT2,XT3,0b0000
    280	VPDI	XD3,XT2,XT3,0b0101
    281
    282	VAF	XA0,XA0,K0	# block 0: add the input state rows
    283	VAF	XB0,XB0,K1
    284	VAF	XC0,XC0,K2
    285	VAF	XD0,XD0,K3
    286
    287	VPERM	XA0,XA0,XA0,BEPERM	# byte-swap every 32-bit word of the keystream block
    288	VPERM	XB0,XB0,XB0,BEPERM
    289	VPERM	XC0,XC0,XC0,BEPERM
    290	VPERM	XD0,XD0,XD0,BEPERM
    291
    292	VLM	XT0,XT3,0,INP,0	# load 64 input bytes
    293
    294	VX	XT0,XT0,XA0	# xor input with keystream
    295	VX	XT1,XT1,XB0
    296	VX	XT2,XT2,XC0
    297	VX	XT3,XT3,XD0
    298
    299	VSTM	XT0,XT3,0,OUT,0	# store 64 output bytes
    300
    301	la	INP,0x40(INP)	# advance both pointers by one block
    302	la	OUT,0x40(OUT)
    303	aghi	LEN,-0x40	# NOTE(review): block 0 was processed without a length check; presumably callers guarantee len > 64 - TODO confirm
    304
    305	VAF	XA0,XA1,K0	# block 1: add the input state rows, result goes to XA0..XD0
    306	VAF	XB0,XB1,K1
    307	VAF	XC0,XC1,K2
    308	VAF	XD0,XD1,K3
    309
    310	VPERM	XA0,XA0,XA0,BEPERM
    311	VPERM	XB0,XB0,XB0,BEPERM
    312	VPERM	XC0,XC0,XC0,BEPERM
    313	VPERM	XD0,XD0,XD0,BEPERM
    314
    315	clgfi	LEN,0x40
    316	jl	.Ltail_4x	# fewer than 64 bytes left: finish byte by byte
    317
    318	VLM	XT0,XT3,0,INP,0
    319
    320	VX	XT0,XT0,XA0
    321	VX	XT1,XT1,XB0
    322	VX	XT2,XT2,XC0
    323	VX	XT3,XT3,XD0
    324
    325	VSTM	XT0,XT3,0,OUT,0
    326
    327	la	INP,0x40(INP)
    328	la	OUT,0x40(OUT)
    329	aghi	LEN,-0x40
    330	je	.Ldone_4x	# nothing left
    331
    332	VAF	XA0,XA2,K0	# block 2: add the input state rows
    333	VAF	XB0,XB2,K1
    334	VAF	XC0,XC2,K2
    335	VAF	XD0,XD2,K3
    336
    337	VPERM	XA0,XA0,XA0,BEPERM
    338	VPERM	XB0,XB0,XB0,BEPERM
    339	VPERM	XC0,XC0,XC0,BEPERM
    340	VPERM	XD0,XD0,XD0,BEPERM
    341
    342	clgfi	LEN,0x40
    343	jl	.Ltail_4x
    344
    345	VLM	XT0,XT3,0,INP,0
    346
    347	VX	XT0,XT0,XA0
    348	VX	XT1,XT1,XB0
    349	VX	XT2,XT2,XC0
    350	VX	XT3,XT3,XD0
    351
    352	VSTM	XT0,XT3,0,OUT,0
    353
    354	la	INP,0x40(INP)
    355	la	OUT,0x40(OUT)
    356	aghi	LEN,-0x40
    357	je	.Ldone_4x
    358
    359	VAF	XA0,XA3,K0	# block 3: add the input state rows
    360	VAF	XB0,XB3,K1
    361	VAF	XC0,XC3,K2
    362	VAF	XD0,XD3,K3
    363
    364	VPERM	XA0,XA0,XA0,BEPERM
    365	VPERM	XB0,XB0,XB0,BEPERM
    366	VPERM	XC0,XC0,XC0,BEPERM
    367	VPERM	XD0,XD0,XD0,BEPERM
    368
    369	clgfi	LEN,0x40
    370	jl	.Ltail_4x
    371
    372	VLM	XT0,XT3,0,INP,0
    373
    374	VX	XT0,XT0,XA0
    375	VX	XT1,XT1,XB0
    376	VX	XT2,XT2,XC0
    377	VX	XT3,XT3,XD0
    378
    379	VSTM	XT0,XT3,0,OUT,0	# last of the four blocks; falls through to the exit
    380
    381.Ldone_4x:
    382	lmg	%r6,%r7,6*8(SP)	# restore r6/r7 saved at entry
    383	BR_EX	%r14
    384
    385.Ltail_4x:	# partial final block: XA0..XD0 hold the byte-swapped keystream block
    386	VLR	XT0,XC0	# copy rows C and D into XT0/XT1 before storing
    387	VLR	XT1,XD0
    388
    389	VST	XA0,8*8+0x00,,SP	# spill the keystream block to 64(SP)..127(SP); SP is unchanged since entry
    390	VST	XB0,8*8+0x10,,SP
    391	VST	XT0,8*8+0x20,,SP
    392	VST	XT1,8*8+0x30,,SP
    393
    394	lghi	%r1,0	# r1 = byte index
    395
    396.Loop_tail_4x:
    397	llgc	%r5,0(%r1,INP)	# r5 (KEY no longer needed) = input byte
    398	llgc	%r6,8*8(%r1,SP)	# r6 = keystream byte
    399	xr	%r6,%r5
    400	stc	%r6,0(%r1,OUT)	# store the output byte
    401	la	%r1,1(%r1)	# next byte index
    402	brct	LEN,.Loop_tail_4x	# brct decrements before testing, so LEN must be 1..63 on entry to this loop
    403
    404	lmg	%r6,%r7,6*8(SP)	# restore r6/r7 (r6 was used as scratch above)
    405	BR_EX	%r14
    406ENDPROC(chacha20_vx_4x)
    407
    408#undef	OUT	/* drop the chacha20_vx_4x aliases that are defined again for chacha20_vx; CTR and the XA/XB/XC/XD/XT aliases stay defined */
    409#undef	INP
    410#undef	LEN
    411#undef	KEY
    412#undef	COUNTER
    413
    414#undef BEPERM
    415
    416#undef K0
    417#undef K1
    418#undef K2
    419#undef K3
    420
    421
    422#############################################################################
    423# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
    424#		   const u32 *key, const u32 *counter)
    425
    426#define	OUT		%r2	/* argument 1: out */
    427#define	INP		%r3	/* argument 2: inp */
    428#define	LEN		%r4	/* argument 3: len, remaining byte count */
    429#define	KEY		%r5	/* argument 4: key; reused as a scratch byte register in the tail loop */
    430#define	COUNTER		%r6	/* argument 5: counter; saved/restored by the routine, reused as scratch in the tail loop */
    431
    432#define BEPERM		%v31	/* byte-swap permute mask, last register of the VLM from .Lsigma */
    433
    434#define K0		%v27	/* sigma row; same register as T0 */
    435#define K1		%v24	/* key words 0-3 */
    436#define K2		%v25	/* key words 4-7 */
    437#define K3		%v26	/* counter block; advanced by 6 once per completed batch */
    438
    439#define A0		%v0	/* A0..D0: the four state rows of block 0; also receives every finished keystream block */
    440#define B0		%v1
    441#define C0		%v2
    442#define D0		%v3
    443
    444#define A1		%v4	/* A1..D1: block 1; reused as the input buffer once block 1 is consumed */
    445#define B1		%v5
    446#define C1		%v6
    447#define D1		%v7
    448
    449#define A2		%v8	/* A2..D2: block 2 */
    450#define B2		%v9
    451#define C2		%v10
    452#define D2		%v11
    453
    454#define A3		%v12	/* A3..D3: block 3 */
    455#define B3		%v13
    456#define C3		%v14
    457#define D3		%v15
    458
    459#define A4		%v16	/* A4..D4: block 4 */
    460#define B4		%v17
    461#define C4		%v18
    462#define D4		%v19
    463
    464#define A5		%v20	/* A5..D5: block 5 */
    465#define B5		%v21
    466#define C5		%v22
    467#define D5		%v23
    468
    469#define T0		%v27	/* same register as K0 */
    470#define T1		%v28	/* T1..T3: increments 1,2,3 after the table load; later hold saved counter rows and input data */
    471#define T2		%v29
    472#define T3		%v30
    473
    474ENTRY(chacha20_vx)	# xors len bytes with ChaCha20 keystream, computing six blocks per outer iteration
    475	clgfi	LEN,256
    476	jle	chacha20_vx_4x	# len <= 256: branch to the 4x routine before anything is saved or allocated
    477	stmg	%r6,%r7,6*8(SP)	# save r6/r7 at 48(SP) relative to the incoming SP
    478
    479	lghi	%r1,-FRAME
    480	lgr	%r0,SP	# r0 = incoming SP
    481	la	SP,0(%r1,SP)	# SP -= FRAME
    482	stg	%r0,0(SP)		# back-chain
    483
    484	larl	%r7,.Lsigma	# r7 = base of the constant table
    485	lhi	%r0,10	# 10 loop iterations, each one column round plus one diagonal round
    486
    487	VLM	K1,K2,0,KEY,0		# load key
    488	VL	K3,0,,COUNTER		# load counter
    489
    490	VLM	K0,BEPERM,0,%r7,4	# load sigma, increments, ...
    491
    492.Loop_outer_vx:	# set up the state of six consecutive blocks
    493	VLR	A0,K0	# rows A and B are identical in all six blocks
    494	VLR	B0,K1
    495	VLR	A1,K0
    496	VLR	B1,K1
    497	VLR	A2,K0
    498	VLR	B2,K1
    499	VLR	A3,K0
    500	VLR	B3,K1
    501	VLR	A4,K0
    502	VLR	B4,K1
    503	VLR	A5,K0
    504	VLR	B5,K1
    505
    506	VLR	D0,K3	# row D differs per block: counter row plus block index
    507	VAF	D1,K3,T1		# K[3]+1
    508	VAF	D2,K3,T2		# K[3]+2
    509	VAF	D3,K3,T3		# K[3]+3
    510	VAF	D4,D2,T2		# K[3]+4
    511	VAF	D5,D2,T3		# K[3]+5
    512
    513	VLR	C0,K2	# row C is identical in all six blocks
    514	VLR	C1,K2
    515	VLR	C2,K2
    516	VLR	C3,K2
    517	VLR	C4,K2
    518	VLR	C5,K2
    519
    520	VLR	T1,D1	# keep K[3]+1..K[3]+3 for the final state addition; T1..T3 stop holding the increments here
    521	VLR	T2,D2
    522	VLR	T3,D3
    523
    524.Loop_vx:
    525	VAF	A0,A0,B0	# column round, step 1 for all six blocks: a += b
    526	VAF	A1,A1,B1
    527	VAF	A2,A2,B2
    528	VAF	A3,A3,B3
    529	VAF	A4,A4,B4
    530	VAF	A5,A5,B5
    531	VX	D0,D0,A0	# d ^= a
    532	VX	D1,D1,A1
    533	VX	D2,D2,A2
    534	VX	D3,D3,A3
    535	VX	D4,D4,A4
    536	VX	D5,D5,A5
    537	VERLLF	D0,D0,16	# d <<<= 16
    538	VERLLF	D1,D1,16
    539	VERLLF	D2,D2,16
    540	VERLLF	D3,D3,16
    541	VERLLF	D4,D4,16
    542	VERLLF	D5,D5,16
    543
    544	VAF	C0,C0,D0	# step 2: c += d
    545	VAF	C1,C1,D1
    546	VAF	C2,C2,D2
    547	VAF	C3,C3,D3
    548	VAF	C4,C4,D4
    549	VAF	C5,C5,D5
    550	VX	B0,B0,C0	# b ^= c
    551	VX	B1,B1,C1
    552	VX	B2,B2,C2
    553	VX	B3,B3,C3
    554	VX	B4,B4,C4
    555	VX	B5,B5,C5
    556	VERLLF	B0,B0,12	# b <<<= 12
    557	VERLLF	B1,B1,12
    558	VERLLF	B2,B2,12
    559	VERLLF	B3,B3,12
    560	VERLLF	B4,B4,12
    561	VERLLF	B5,B5,12
    562
    563	VAF	A0,A0,B0	# step 3: a += b
    564	VAF	A1,A1,B1
    565	VAF	A2,A2,B2
    566	VAF	A3,A3,B3
    567	VAF	A4,A4,B4
    568	VAF	A5,A5,B5
    569	VX	D0,D0,A0	# d ^= a
    570	VX	D1,D1,A1
    571	VX	D2,D2,A2
    572	VX	D3,D3,A3
    573	VX	D4,D4,A4
    574	VX	D5,D5,A5
    575	VERLLF	D0,D0,8	# d <<<= 8
    576	VERLLF	D1,D1,8
    577	VERLLF	D2,D2,8
    578	VERLLF	D3,D3,8
    579	VERLLF	D4,D4,8
    580	VERLLF	D5,D5,8
    581
    582	VAF	C0,C0,D0	# step 4: c += d
    583	VAF	C1,C1,D1
    584	VAF	C2,C2,D2
    585	VAF	C3,C3,D3
    586	VAF	C4,C4,D4
    587	VAF	C5,C5,D5
    588	VX	B0,B0,C0	# b ^= c
    589	VX	B1,B1,C1
    590	VX	B2,B2,C2
    591	VX	B3,B3,C3
    592	VX	B4,B4,C4
    593	VX	B5,B5,C5
    594	VERLLF	B0,B0,7	# b <<<= 7
    595	VERLLF	B1,B1,7
    596	VERLLF	B2,B2,7
    597	VERLLF	B3,B3,7
    598	VERLLF	B4,B4,7
    599	VERLLF	B5,B5,7
    600
    601	VSLDB	C0,C0,C0,8	# rotate row C by 8 bytes so the next steps work on diagonals
    602	VSLDB	C1,C1,C1,8
    603	VSLDB	C2,C2,C2,8
    604	VSLDB	C3,C3,C3,8
    605	VSLDB	C4,C4,C4,8
    606	VSLDB	C5,C5,C5,8
    607	VSLDB	B0,B0,B0,4	# rotate row B by 4 bytes
    608	VSLDB	B1,B1,B1,4
    609	VSLDB	B2,B2,B2,4
    610	VSLDB	B3,B3,B3,4
    611	VSLDB	B4,B4,B4,4
    612	VSLDB	B5,B5,B5,4
    613	VSLDB	D0,D0,D0,12	# rotate row D by 12 bytes
    614	VSLDB	D1,D1,D1,12
    615	VSLDB	D2,D2,D2,12
    616	VSLDB	D3,D3,D3,12
    617	VSLDB	D4,D4,D4,12
    618	VSLDB	D5,D5,D5,12
    619
    620	VAF	A0,A0,B0	# diagonal round, step 1: a += b
    621	VAF	A1,A1,B1
    622	VAF	A2,A2,B2
    623	VAF	A3,A3,B3
    624	VAF	A4,A4,B4
    625	VAF	A5,A5,B5
    626	VX	D0,D0,A0	# d ^= a
    627	VX	D1,D1,A1
    628	VX	D2,D2,A2
    629	VX	D3,D3,A3
    630	VX	D4,D4,A4
    631	VX	D5,D5,A5
    632	VERLLF	D0,D0,16	# d <<<= 16
    633	VERLLF	D1,D1,16
    634	VERLLF	D2,D2,16
    635	VERLLF	D3,D3,16
    636	VERLLF	D4,D4,16
    637	VERLLF	D5,D5,16
    638
    639	VAF	C0,C0,D0	# step 2: c += d
    640	VAF	C1,C1,D1
    641	VAF	C2,C2,D2
    642	VAF	C3,C3,D3
    643	VAF	C4,C4,D4
    644	VAF	C5,C5,D5
    645	VX	B0,B0,C0	# b ^= c
    646	VX	B1,B1,C1
    647	VX	B2,B2,C2
    648	VX	B3,B3,C3
    649	VX	B4,B4,C4
    650	VX	B5,B5,C5
    651	VERLLF	B0,B0,12	# b <<<= 12
    652	VERLLF	B1,B1,12
    653	VERLLF	B2,B2,12
    654	VERLLF	B3,B3,12
    655	VERLLF	B4,B4,12
    656	VERLLF	B5,B5,12
    657
    658	VAF	A0,A0,B0	# step 3: a += b
    659	VAF	A1,A1,B1
    660	VAF	A2,A2,B2
    661	VAF	A3,A3,B3
    662	VAF	A4,A4,B4
    663	VAF	A5,A5,B5
    664	VX	D0,D0,A0	# d ^= a
    665	VX	D1,D1,A1
    666	VX	D2,D2,A2
    667	VX	D3,D3,A3
    668	VX	D4,D4,A4
    669	VX	D5,D5,A5
    670	VERLLF	D0,D0,8	# d <<<= 8
    671	VERLLF	D1,D1,8
    672	VERLLF	D2,D2,8
    673	VERLLF	D3,D3,8
    674	VERLLF	D4,D4,8
    675	VERLLF	D5,D5,8
    676
    677	VAF	C0,C0,D0	# step 4: c += d
    678	VAF	C1,C1,D1
    679	VAF	C2,C2,D2
    680	VAF	C3,C3,D3
    681	VAF	C4,C4,D4
    682	VAF	C5,C5,D5
    683	VX	B0,B0,C0	# b ^= c
    684	VX	B1,B1,C1
    685	VX	B2,B2,C2
    686	VX	B3,B3,C3
    687	VX	B4,B4,C4
    688	VX	B5,B5,C5
    689	VERLLF	B0,B0,7	# b <<<= 7
    690	VERLLF	B1,B1,7
    691	VERLLF	B2,B2,7
    692	VERLLF	B3,B3,7
    693	VERLLF	B4,B4,7
    694	VERLLF	B5,B5,7
    695
    696	VSLDB	C0,C0,C0,8	# rotate the rows back to column order: C by 8 bytes
    697	VSLDB	C1,C1,C1,8
    698	VSLDB	C2,C2,C2,8
    699	VSLDB	C3,C3,C3,8
    700	VSLDB	C4,C4,C4,8
    701	VSLDB	C5,C5,C5,8
    702	VSLDB	B0,B0,B0,12	# B by 12 bytes (undoes the earlier 4)
    703	VSLDB	B1,B1,B1,12
    704	VSLDB	B2,B2,B2,12
    705	VSLDB	B3,B3,B3,12
    706	VSLDB	B4,B4,B4,12
    707	VSLDB	B5,B5,B5,12
    708	VSLDB	D0,D0,D0,4	# D by 4 bytes (undoes the earlier 12)
    709	VSLDB	D1,D1,D1,4
    710	VSLDB	D2,D2,D2,4
    711	VSLDB	D3,D3,D3,4
    712	VSLDB	D4,D4,D4,4
    713	VSLDB	D5,D5,D5,4
    714	brct	%r0,.Loop_vx	# decrement r0, repeat until it reaches 0
    715
    716	VAF	A0,A0,K0	# block 0: add the input state rows
    717	VAF	B0,B0,K1
    718	VAF	C0,C0,K2
    719	VAF	D0,D0,K3
    720	VAF	A1,A1,K0	# done early: K0 shares v27 with T0, which the input load below overwrites
    721	VAF	D1,D1,T1		# +K[3]+1
    722
    723	VPERM	A0,A0,A0,BEPERM	# byte-swap every 32-bit word of the keystream block
    724	VPERM	B0,B0,B0,BEPERM
    725	VPERM	C0,C0,C0,BEPERM
    726	VPERM	D0,D0,D0,BEPERM
    727
    728	clgfi	LEN,0x40
    729	jl	.Ltail_vx	# fewer than 64 bytes left: finish byte by byte
    730
    731	VAF	D2,D2,T2		# +K[3]+2
    732	VAF	D3,D3,T3		# +K[3]+3
    733	VLM	T0,T3,0,INP,0	# load 64 input bytes; overwrites K0 (v27) and T1..T3
    734
    735	VX	A0,A0,T0	# xor keystream with input
    736	VX	B0,B0,T1
    737	VX	C0,C0,T2
    738	VX	D0,D0,T3
    739
    740	VLM	K0,T3,0,%r7,4		# re-load sigma and increments
    741
    742	VSTM	A0,D0,0,OUT,0	# store 64 output bytes
    743
    744	la	INP,0x40(INP)	# advance both pointers by one block
    745	la	OUT,0x40(OUT)
    746	aghi	LEN,-0x40
    747	je	.Ldone_vx	# nothing left
    748
    749	VAF	B1,B1,K1	# block 1: rows A and D already had their additions above
    750	VAF	C1,C1,K2
    751
    752	VPERM	A0,A1,A1,BEPERM	# byte-swapped block 1 goes to A0..D0
    753	VPERM	B0,B1,B1,BEPERM
    754	VPERM	C0,C1,C1,BEPERM
    755	VPERM	D0,D1,D1,BEPERM
    756
    757	clgfi	LEN,0x40
    758	jl	.Ltail_vx
    759
    760	VLM	A1,D1,0,INP,0	# A1..D1 are free now and serve as the input buffer from here on
    761
    762	VX	A0,A0,A1
    763	VX	B0,B0,B1
    764	VX	C0,C0,C1
    765	VX	D0,D0,D1
    766
    767	VSTM	A0,D0,0,OUT,0
    768
    769	la	INP,0x40(INP)
    770	la	OUT,0x40(OUT)
    771	aghi	LEN,-0x40
    772	je	.Ldone_vx
    773
    774	VAF	A2,A2,K0	# block 2: row D was already completed before the first input load
    775	VAF	B2,B2,K1
    776	VAF	C2,C2,K2
    777
    778	VPERM	A0,A2,A2,BEPERM
    779	VPERM	B0,B2,B2,BEPERM
    780	VPERM	C0,C2,C2,BEPERM
    781	VPERM	D0,D2,D2,BEPERM
    782
    783	clgfi	LEN,0x40
    784	jl	.Ltail_vx
    785
    786	VLM	A1,D1,0,INP,0
    787
    788	VX	A0,A0,A1
    789	VX	B0,B0,B1
    790	VX	C0,C0,C1
    791	VX	D0,D0,D1
    792
    793	VSTM	A0,D0,0,OUT,0
    794
    795	la	INP,0x40(INP)
    796	la	OUT,0x40(OUT)
    797	aghi	LEN,-0x40
    798	je	.Ldone_vx
    799
    800	VAF	A3,A3,K0	# block 3
    801	VAF	B3,B3,K1
    802	VAF	C3,C3,K2
    803	VAF	D2,K3,T3		# K[3]+3
    804
    805	VPERM	A0,A3,A3,BEPERM
    806	VPERM	B0,B3,B3,BEPERM
    807	VPERM	C0,C3,C3,BEPERM
    808	VPERM	D0,D3,D3,BEPERM	# last read of block 3 row D; D3 becomes a counter scratch register below
    809
    810	clgfi	LEN,0x40
    811	jl	.Ltail_vx
    812
    813	VAF	D3,D2,T1		# K[3]+4
    814	VLM	A1,D1,0,INP,0
    815
    816	VX	A0,A0,A1
    817	VX	B0,B0,B1
    818	VX	C0,C0,C1
    819	VX	D0,D0,D1
    820
    821	VSTM	A0,D0,0,OUT,0
    822
    823	la	INP,0x40(INP)
    824	la	OUT,0x40(OUT)
    825	aghi	LEN,-0x40
    826	je	.Ldone_vx
    827
    828	VAF	A4,A4,K0	# block 4
    829	VAF	B4,B4,K1
    830	VAF	C4,C4,K2
    831	VAF	D4,D4,D3		# +K[3]+4
    832	VAF	D3,D3,T1		# K[3]+5
    833	VAF	K3,D2,T3		# K[3]+=6
    834
    835	VPERM	A0,A4,A4,BEPERM
    836	VPERM	B0,B4,B4,BEPERM
    837	VPERM	C0,C4,C4,BEPERM
    838	VPERM	D0,D4,D4,BEPERM
    839
    840	clgfi	LEN,0x40
    841	jl	.Ltail_vx
    842
    843	VLM	A1,D1,0,INP,0
    844
    845	VX	A0,A0,A1
    846	VX	B0,B0,B1
    847	VX	C0,C0,C1
    848	VX	D0,D0,D1
    849
    850	VSTM	A0,D0,0,OUT,0
    851
    852	la	INP,0x40(INP)
    853	la	OUT,0x40(OUT)
    854	aghi	LEN,-0x40
    855	je	.Ldone_vx
    856
    857	VAF	A5,A5,K0	# block 5
    858	VAF	B5,B5,K1
    859	VAF	C5,C5,K2
    860	VAF	D5,D5,D3		# +K[3]+5
    861
    862	VPERM	A0,A5,A5,BEPERM
    863	VPERM	B0,B5,B5,BEPERM
    864	VPERM	C0,C5,C5,BEPERM
    865	VPERM	D0,D5,D5,BEPERM
    866
    867	clgfi	LEN,0x40
    868	jl	.Ltail_vx
    869
    870	VLM	A1,D1,0,INP,0
    871
    872	VX	A0,A0,A1
    873	VX	B0,B0,B1
    874	VX	C0,C0,C1
    875	VX	D0,D0,D1
    876
    877	VSTM	A0,D0,0,OUT,0
    878
    879	la	INP,0x40(INP)
    880	la	OUT,0x40(OUT)
    881	lhi	%r0,10	# reset the iteration counter for the next batch
    882	aghi	LEN,-0x40
    883	jne	.Loop_outer_vx	# bytes remain: compute the next six blocks
    884
    885.Ldone_vx:
    886	lmg	%r6,%r7,FRAME+6*8(SP)	# restore r6/r7 from the slots written at entry (incoming SP + 48)
    887	la	SP,FRAME(SP)	# release the frame
    888	BR_EX	%r14
    889
    890.Ltail_vx:	# partial final block: A0..D0 hold the byte-swapped keystream block
    891	VSTM	A0,D0,8*8,SP,3	# spill the keystream block to 64(SP)..127(SP) inside the local frame
    892	lghi	%r1,0	# r1 = byte index
    893
    894.Loop_tail_vx:
    895	llgc	%r5,0(%r1,INP)	# r5 (KEY no longer needed) = input byte
    896	llgc	%r6,8*8(%r1,SP)	# r6 = keystream byte
    897	xr	%r6,%r5
    898	stc	%r6,0(%r1,OUT)	# store the output byte
    899	la	%r1,1(%r1)	# next byte index
    900	brct	LEN,.Loop_tail_vx	# brct decrements before testing; LEN is 1..63 here
    901
    902	lmg	%r6,%r7,FRAME+6*8(SP)	# restore r6/r7 (r6 was used as scratch above)
    903	la	SP,FRAME(SP)	# release the frame
    904	BR_EX	%r14
    905ENDPROC(chacha20_vx)
    906
    907.previous