cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ghash-ce-core.S (17529B)


/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm
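
	/*
	 * The \pn suffix (p64 or p8) selects the implementation of the
	 * 64x64->128 carryless multiply: the p64 macros use the single
	 * PMULL/PMULL2 instructions, which require the optional ARMv8
	 * Crypto Extensions, while the p8 macros below synthesize the same
	 * product from 8x8-bit polynomial multiplies, which are part of
	 * baseline AdvSIMD and therefore available on all ARMv8 CPUs.
	 */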

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm
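
	/*
	 * Note on the p8 path: pmull on .8b operands produces eight
	 * separate 8x8->16-bit polynomial products rather than one wide
	 * one, so the macros above multiply byte-rotated copies of the
	 * operands (A1..A3 against B, A against B1..B4), mask off the
	 * unwanted halves with k00_16/k32_48, shift the partial products
	 * into position with ext, and fold them into \rq. This follows the
	 * same decomposition as the 32-bit ARM NEON GHASH code.
	 */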

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm
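
	/*
	 * The key struct carries H as well as the precomputed powers H^2
	 * (HH), H^3 (HH3) and H^4 (HH4), which the 4-way code below uses
	 * to aggregate four blocks per reduction. The trn1/trn2 + eor
	 * pairs build the Karatsuba "middle" operands: each half of SHASH2
	 * (resp. HH34) holds the XOR of the low and high 64-bit halves of
	 * one power, i.e. (b1 + b0) ready for the (a1 + a0)(b1 + b0)
	 * multiply.
	 */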

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm
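
	/*
	 * GHASH works in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, with
	 * the bits of each block in reflected order. MASK (0xe1 shifted
	 * left by 57, i.e. 0xc2 in the top byte of each doubleword) is the
	 * reduction constant in that bit order, and the two pmull-by-MASK
	 * steps fold the 256-bit Karatsuba result in XH:XL (middle limb
	 * XM) back down to 128 bits.
	 */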

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm
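
	/*
	 * Without a 64-bit PMULL, the multiply by the reduction constant
	 * is open-coded as shifts: the shl by 63, 62 and 57 correspond to
	 * the x, x^2 and x^7 taps of the polynomial in reflected bit
	 * order, and the trailing ushr sequence applies the matching folds
	 * in the other direction. The net effect is the same reduction
	 * that __pmull_reduce_p64 gets from two pmull-by-MASK
	 * instructions.
	 */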

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm
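
	/*
	 * The 4-way p64 loop above hashes four blocks C1..C4 per reduction
	 * using the aggregated form (with + denoting XOR)
	 *
	 *   X' = ((((X + C1)*H + C2)*H + C3)*H + C4)*H
	 *      = (X + C1)*H^4 + C2*H^3 + C3*H^2 + C4*H
	 *
	 * so the accumulator XL is folded into the first block and
	 * multiplied by HH4, the later blocks use HH3, HH and SHASH, and
	 * all partial products are XORed together before a single
	 * __pmull_reduce_p64.
	 */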

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm
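
	/*
	 * The first six round keys stay resident in K0-K5 and the final
	 * three (at byte offset (\rounds - 2) * 16 of the expanded
	 * schedule) in KK/KL/KM; the middle keys are fetched on demand
	 * into K6-K9 by enc_block and pmull_gcm_enc_4x, since the vector
	 * registers are largely occupied by GHASH state.
	 */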

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm
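
	/*
	 * The bit tests on \rounds exploit the encoding of the AES round
	 * counts: 10 (AES-128) has bit 2 clear, while 12 (AES-192) and 14
	 * (AES-256) have it set, and bit 1 then distinguishes 12 from 14.
	 * That dispatches to the 10/12/14-round tails without a compare.
	 */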

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	stp		x29, x30, [sp, #-32]!
	mov		x29, sp
	str		x19, [sp, #24]

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
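	/*
	 * Below, x19 is set to the length of the final, possibly partial
	 * block (16 when the tail is block aligned), the input/output
	 * pointers are rewound so the overlapping loads still end exactly
	 * at the end of the buffer, and the tbl on INP3 uses a mask taken
	 * from .Lpermute_table to move the tail bytes into their proper
	 * lanes.
	 */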
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldp		x19, x10, [sp, #24]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	ldp		x29, x30, [sp], #32
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm
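
	/*
	 * Only the first eight arguments arrive in x0-x7; the tag/lengths
	 * pointer (and, for decryption, the expected tag pointer and the
	 * authsize) are passed on the caller's stack. That is why the ldp
	 * at [sp, #24] pairs the saved x19 with the first stack argument
	 * at [sp, #32], and the decrypt path reads two more at [sp, #40].
	 */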

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)
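
	/*
	 * On entry, w9 holds the number of blocks to hash (1-4). The tbz
	 * tests on its low bits select an entry point into the multiply
	 * chain (.Lgh1/.Lgh2/.Lgh3), so a 1-3 block tail reuses the same
	 * code with the higher powers of H skipped and the accumulator
	 * folded into the first block actually processed.
	 */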

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)
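
	/*
	 * Since the caller already advanced w8 past this batch, the four
	 * counter values are reconstructed as w8 - 4 .. w8 - 1, byte
	 * swapped into big-endian and inserted as the low word of each
	 * counter block. The four AES streams are interleaved round by
	 * round (enc_qround) to hide the aese/aesmc latency, and the
	 * resulting keystream is XORed straight into INP0-INP3.
	 */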

	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous
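
	/*
	 * .Lpermute_table interleaves runs of 0xff with the identity
	 * permutation 0x00-0x0f. Loading 16 consecutive bytes at varying
	 * offsets yields tbl/tbx masks that shift a vector by 1-15 bytes:
	 * 0xff entries produce zero lanes under tbl (and leave lanes
	 * untouched under tbx), while the identity entries select live
	 * data bytes.
	 */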