cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

aes-modes.S (17382B)


/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
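/*
 * MAX_STRIDE selects the interleave factor: it defaults to 4 and may be
 * predefined to 5 by the including file. ST4()/ST5() expand their
 * argument only for the matching stride, letting the 4-way and 5-way
 * code paths share this single source.
 */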

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
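	/*
	 * Both ECB routines share one shape: the Nx loop processes
	 * MAX_STRIDE blocks per iteration through the interleaved
	 * subroutines above, and the 1x loop mops up the remaining
	 * blocks one at a time.
	 */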

AES_FUNC_START(aes_ecb_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbencout
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbdecout
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
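	/*
	 * The ESSIV entry points differ from plain CBC only in the IV
	 * derivation: the caller's IV is first encrypted with the second
	 * key schedule rk2 (always AES-256, hence the hard-coded 14
	 * rounds) before falling through to the common CBC code, which
	 * then runs with rk1.
	 */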

AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

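	/*
	 * CBC encryption is inherently serial: each block must be XORed
	 * with the previous ciphertext before it can be encrypted, so the
	 * 4x loop only batches loads and stores; the four encrypt_block
	 * calls still execute back to back.
	 */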
.Lcbcencloop4x:
	subs		w4, w4, #4
	bmi		.Lcbcenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

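	/*
	 * Unlike encryption, CBC decryption can be parallelized: each
	 * plaintext is dec(ct[i]) ^ ct[i-1]. The loop below copies the
	 * ciphertext blocks aside before the interleaved decrypt
	 * overwrites v0-v3/v4 in place, keeping them available as the
	 * XOR inputs of the following blocks.
	 */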
.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
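	/*
	 * Ciphertext stealing: the final partial block is handled by
	 * overlapping loads/stores with the preceding full block, and
	 * tbl/tbx with .Lcts_permute_table shift and splice the bytes.
	 * An out-of-range (0xff) index makes tbl produce a zero byte,
	 * while tbx leaves the destination byte unchanged, so offsetting
	 * into the table yields the required shift and merge masks.
	 */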

AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 ctr[])
	 */

AES_FUNC_START(aes_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x12
	ld1		{vctr.16b}, [x5]

	umov		x12, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x12, x12

.LctrloopNx:
	add		w7, w4, #15
	sub		w4, w4, #MAX_STRIDE << 4
	lsr		w7, w7, #4
	mov		w8, #MAX_STRIDE
	cmp		w7, w8
	csel		w7, w7, w8, lt
	adds		x12, x12, x7

	mov		v0.16b, vctr.16b
	mov		v1.16b, vctr.16b
	mov		v2.16b, vctr.16b
	mov		v3.16b, vctr.16b
ST5(	mov		v4.16b, vctr.16b		)
	bcs		0f

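	/*
	 * The adds above overflowed the low 64 counter bits; the cold
	 * carry path below lives in subsection 1 to stay out of the hot
	 * loop. After bumping the upper counter word, the computed br
	 * drops into a table of 8-byte entries (bti c + mov) so that
	 * only the last x12 counter blocks, those lying past the
	 * wraparound, pick up the incremented upper word.
	 */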
	.subsection	1
	/* apply carry to outgoing counter */
0:	umov		x8, vctr.d[0]
	rev		x8, x8
	add		x8, x8, #1
	rev		x8, x8
	ins		vctr.d[0], x8

	/* apply carry to N counter blocks for N := x12 */
	cbz		x12, 2f
	adr		x16, 1f
	sub		x16, x16, x12, lsl #3
	br		x16
	bti		c
	mov		v0.d[0], vctr.d[0]
	bti		c
	mov		v1.d[0], vctr.d[0]
	bti		c
	mov		v2.d[0], vctr.d[0]
	bti		c
	mov		v3.d[0], vctr.d[0]
ST5(	bti		c				)
ST5(	mov		v4.d[0], vctr.d[0]		)
1:	b		2f
	.previous

2:	rev		x7, x12
	ins		vctr.d[1], x7
	sub		x7, x12, #MAX_STRIDE - 1
	sub		x8, x12, #MAX_STRIDE - 2
	sub		x9, x12, #MAX_STRIDE - 3
	rev		x7, x7
	rev		x8, x8
	mov		v1.d[1], x7
	rev		x9, x9
ST5(	sub		x10, x12, #MAX_STRIDE - 4	)
	mov		v2.d[1], x8
ST5(	rev		x10, x10			)
	mov		v3.d[1], x9
ST5(	mov		v4.d[1], x10			)
	tbnz		w4, #31, .Lctrtail
	ld1		{v5.16b-v7.16b}, [x1], #48
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	cbz		w4, .Lctrout
	b		.LctrloopNx

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

.Lctrtail:
	/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
	mov		x16, #16
	ands		x6, x4, #0xf
	csel		x13, x6, x16, ne
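	/*
	 * x13 = size of the final, possibly partial, block (16 if the
	 * length is block aligned). The csels below set x14-x16 to 16 or
	 * 0, so each post-indexed load/store either consumes a whole
	 * block or leaves its pointer in place for the overlapping
	 * accesses that follow, depending on how many full blocks remain.
	 */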

ST5(	cmp		w4, #64 - (MAX_STRIDE << 4)	)
ST5(	csel		x14, x16, xzr, gt		)
	cmp		w4, #48 - (MAX_STRIDE << 4)
	csel		x15, x16, xzr, gt
	cmp		w4, #32 - (MAX_STRIDE << 4)
	csel		x16, x16, xzr, gt
	cmp		w4, #16 - (MAX_STRIDE << 4)

	adr_l		x12, .Lcts_permute_table
	add		x12, x12, x13
	ble		.Lctrtail1x

ST5(	ld1		{v5.16b}, [x1], x14		)
	ld1		{v6.16b}, [x1], x15
	ld1		{v7.16b}, [x1], x16

ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)

	ld1		{v8.16b}, [x1], x13
	ld1		{v9.16b}, [x1]
	ld1		{v10.16b}, [x12]

ST4(	eor		v6.16b, v6.16b, v0.16b		)
ST4(	eor		v7.16b, v7.16b, v1.16b		)
ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
ST4(	eor		v8.16b, v8.16b, v2.16b		)
ST4(	eor		v9.16b, v9.16b, v3.16b		)

ST5(	eor		v5.16b, v5.16b, v0.16b		)
ST5(	eor		v6.16b, v6.16b, v1.16b		)
ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
ST5(	eor		v7.16b, v7.16b, v2.16b		)
ST5(	eor		v8.16b, v8.16b, v3.16b		)
ST5(	eor		v9.16b, v9.16b, v4.16b		)

ST5(	st1		{v5.16b}, [x0], x14		)
	st1		{v6.16b}, [x0], x15
	st1		{v7.16b}, [x0], x16
	add		x13, x13, x0
	st1		{v9.16b}, [x13]		// overlapping stores
	st1		{v8.16b}, [x0]
	b		.Lctrout

.Lctrtail1x:
	sub		x7, x6, #16
	csel		x6, x6, x7, eq
	add		x1, x1, x6
	add		x0, x0, x6
	ld1		{v5.16b}, [x1]
	ld1		{v6.16b}, [x0]
ST5(	mov		v3.16b, v4.16b			)
	encrypt_block	v3, w3, x2, x8, w7
	ld1		{v10.16b-v11.16b}, [x12]
	tbl		v3.16b, {v3.16b}, v10.16b
	sshr		v11.16b, v11.16b, #7
	eor		v5.16b, v5.16b, v3.16b
	bif		v5.16b, v6.16b, v11.16b
	st1		{v5.16b}, [x0]
	b		.Lctrout
AES_FUNC_END(aes_ctr_encrypt)


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */

	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
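	/*
	 * next_tweak multiplies the tweak by x in GF(2^128): add doubles
	 * each 64-bit lane, sshr/and extract the per-lane carries, and
	 * ext/eor propagate the bit 63 carry into the upper lane (mask
	 * 0x1) while folding the bit 127 carry back into the low byte as
	 * the reduction polynomial 0x87. xts_load_mask builds exactly
	 * that { 0x1, 0x87 } pair of 64-bit lanes in xtsmask.
	 */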

AES_FUNC_START(aes_xts_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq
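	/*
	 * When the length is not block aligned, the main loop stops 16
	 * bytes early so the CTS tail can process the last full block
	 * together with the partial one. Decryption consumes the two
	 * final tweaks in swapped order: .Lxtsdeccts computes the next
	 * tweak into v5 for the stored full block, then reuses v4 for
	 * the reconstructed final block.
	 */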

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
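	/*
	 * dg[] holds the running CBC-MAC digest: each input block is
	 * XORed in and the result encrypted. enc_before requests one
	 * extra encryption of the incoming digest; enc_after decides
	 * whether the digest is encrypted again after the final block
	 * (the csinv picks enc_after only once no blocks remain, and -1,
	 * i.e. "always encrypt", otherwise). cond_yield exits early when
	 * a reschedule is due; w0 returns the number of unprocessed
	 * blocks.
	 */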
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3
	ret
AES_FUNC_END(aes_mac_update)