cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

aes-ce-core.S (15545B)


      1/* SPDX-License-Identifier: GPL-2.0-only */
      2/*
      3 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
      4 *
      5 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
      6 */
      7
      8#include <linux/linkage.h>
      9#include <asm/assembler.h>
     10
	@ AArch32 code; .arch/.fpu enable the ARMv8-A Crypto Extensions
	@ (aese/aesmc/aesd/aesimc) and the NEON registers used throughout.
      11	.text
      12	.arch		armv8-a
      13	.fpu		crypto-neon-fp-armv8
      14	.align		3
     15
	@ One inner AES encryption round on \state:
	@ AESE = AddRoundKey(\key) + SubBytes + ShiftRows, AESMC = MixColumns.
      16	.macro		enc_round, state, key
      17	aese.8		\state, \key
      18	aesmc.8		\state, \state
      19	.endm
     20
	@ One inner AES decryption round on \state:
	@ AESD = AddRoundKey(\key) + InvSubBytes + InvShiftRows,
	@ AESIMC = InvMixColumns.
      21	.macro		dec_round, state, key
      22	aesd.8		\state, \key
      23	aesimc.8	\state, \state
      24	.endm
     25
	@ Two consecutive encryption rounds on q0 (a "double round").
      26	.macro		enc_dround, key1, key2
      27	enc_round	q0, \key1
      28	enc_round	q0, \key2
      29	.endm
     30
	@ Two consecutive decryption rounds on q0.
      31	.macro		dec_dround, key1, key2
      32	dec_round	q0, \key1
      33	dec_round	q0, \key2
      34	.endm
     35
	@ Penultimate + final encryption rounds on q0: the last round has no
	@ MixColumns (bare aese), and the final AddRoundKey is the veor with
	@ the last round key \key3.
      36	.macro		enc_fround, key1, key2, key3
      37	enc_round	q0, \key1
      38	aese.8		q0, \key2
      39	veor		q0, q0, \key3
      40	.endm
     41
	@ Penultimate + final decryption rounds on q0: the last round has no
	@ InvMixColumns (bare aesd), final AddRoundKey via veor with \key3.
      42	.macro		dec_fround, key1, key2, key3
      43	dec_round	q0, \key1
      44	aesd.8		q0, \key2
      45	veor		q0, q0, \key3
      46	.endm
     47
	@ enc_dround interleaved over four independent blocks q0-q3, so the
	@ latency of each aese/aesmc pair is hidden by the other blocks.
      48	.macro		enc_dround_4x, key1, key2
      49	enc_round	q0, \key1
      50	enc_round	q1, \key1
      51	enc_round	q2, \key1
      52	enc_round	q3, \key1
      53	enc_round	q0, \key2
      54	enc_round	q1, \key2
      55	enc_round	q2, \key2
      56	enc_round	q3, \key2
      57	.endm
     58
	@ dec_dround interleaved over four independent blocks q0-q3.
      59	.macro		dec_dround_4x, key1, key2
      60	dec_round	q0, \key1
      61	dec_round	q1, \key1
      62	dec_round	q2, \key1
      63	dec_round	q3, \key1
      64	dec_round	q0, \key2
      65	dec_round	q1, \key2
      66	dec_round	q2, \key2
      67	dec_round	q3, \key2
      68	.endm
     69
	@ enc_fround (final two rounds) interleaved over blocks q0-q3.
      70	.macro		enc_fround_4x, key1, key2, key3
      71	enc_round	q0, \key1
      72	enc_round	q1, \key1
      73	enc_round	q2, \key1
      74	enc_round	q3, \key1
      75	aese.8		q0, \key2
      76	aese.8		q1, \key2
      77	aese.8		q2, \key2
      78	aese.8		q3, \key2
      79	veor		q0, q0, \key3
      80	veor		q1, q1, \key3
      81	veor		q2, q2, \key3
      82	veor		q3, q3, \key3
      83	.endm
     84
	@ dec_fround (final two rounds) interleaved over blocks q0-q3.
      85	.macro		dec_fround_4x, key1, key2, key3
      86	dec_round	q0, \key1
      87	dec_round	q1, \key1
      88	dec_round	q2, \key1
      89	dec_round	q3, \key1
      90	aesd.8		q0, \key2
      91	aesd.8		q1, \key2
      92	aesd.8		q2, \key2
      93	aesd.8		q3, \key2
      94	veor		q0, q0, \key3
      95	veor		q1, q1, \key3
      96	veor		q2, q2, \key3
      97	veor		q3, q3, \key3
      98	.endm
     99
	@ Run a complete AES transform. On entry q8/q9 hold round keys 1-2,
	@ q14 the last round key, and ip points at round key 3; further keys
	@ are streamed in pairs (alternating q10/q11 and q12/q13) while the
	@ previous pair is being consumed. r3 (number of rounds) selects
	@ 10/12/14 rounds via the flags set by the cmp below.
     100	.macro		do_block, dround, fround
     101	cmp		r3, #12			@ which key size?
     102	vld1.32		{q10-q11}, [ip]!
     103	\dround		q8, q9
     104	vld1.32		{q12-q13}, [ip]!
     105	\dround		q10, q11
     106	vld1.32		{q10-q11}, [ip]!
     107	\dround		q12, q13
     108	vld1.32		{q12-q13}, [ip]!
     109	\dround		q10, q11
     110	blo		0f			@ AES-128: 10 rounds
     111	vld1.32		{q10-q11}, [ip]!
     112	\dround		q12, q13
     113	beq		1f			@ AES-192: 12 rounds
     114	vld1.32		{q12-q13}, [ip]
     115	\dround		q10, q11
     1160:	\fround		q12, q13, q14
     117	bx		lr
     118
     1191:	\fround		q10, q11, q14
     120	bx		lr
     121	.endm
    122
    123	/*
    124	 * Internal, non-AAPCS compliant functions that implement the core AES
     125	 * transforms. These should preserve all registers except q0 - q3 and ip
    126	 * Arguments:
    127	 *   q0        : first in/output block
    128	 *   q1        : second in/output block (_4x version only)
    129	 *   q2        : third in/output block (_4x version only)
    130	 *   q3        : fourth in/output block (_4x version only)
    131	 *   q8        : first round key
     132	 * q9        : second round key
    133	 *   q14       : final round key
    134	 *   r2        : address of round key array
    135	 *   r3        : number of rounds
    136	 */
	@ Encrypt the single block in q0.  The .Laes_encrypt_tweak entry is
	@ tail-called by ce_aes_xts_init with ip already pointing at the 3rd
	@ round key of the second (tweak) key.
     137	.align		6
     138aes_encrypt:
     139	add		ip, r2, #32		@ 3rd round key
     140.Laes_encrypt_tweak:
     141	do_block	enc_dround, enc_fround
     142ENDPROC(aes_encrypt)
    143
	@ Decrypt the single block in q0 (see register contract above).
     144	.align		6
     145aes_decrypt:
     146	add		ip, r2, #32		@ 3rd round key
     147	do_block	dec_dround, dec_fround
     148ENDPROC(aes_decrypt)
    149
	@ Encrypt the four blocks in q0-q3 in parallel.
     150	.align		6
     151aes_encrypt_4x:
     152	add		ip, r2, #32		@ 3rd round key
     153	do_block	enc_dround_4x, enc_fround_4x
     154ENDPROC(aes_encrypt_4x)
    155
	@ Decrypt the four blocks in q0-q3 in parallel.
     156	.align		6
     157aes_decrypt_4x:
     158	add		ip, r2, #32		@ 3rd round key
     159	do_block	dec_dround_4x, dec_fround_4x
     160ENDPROC(aes_decrypt_4x)
    161
	@ Load the round keys shared by all do_block invocations:
	@ q8/q9 = first two round keys, q14 = last round key, found at
	@ \rk + \rounds * 16 (each round key is 16 bytes).
     162	.macro		prepare_key, rk, rounds
     163	add		ip, \rk, \rounds, lsl #4
     164	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
     165	vld1.32		{q14}, [ip]		@ load last round key
     166	.endm
    167
    168	/*
    169	 * aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
    170	 *		   int blocks)
    171	 * aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
    172	 *		   int blocks)
    173	 */
	@ ECB encrypt: r0 = out, r1 = in, r2 = round keys, r3 = rounds,
	@ 5th arg (blocks) on the stack at sp+8 after the push below.
     174ENTRY(ce_aes_ecb_encrypt)
     175	push		{r4, lr}
     176	ldr		r4, [sp, #8]		@ r4 = number of blocks
     177	prepare_key	r2, r3
.Lecbencloop4x placeholder
     178.Lecbencloop4x:
     179	subs		r4, r4, #4		@ 4 blocks per iteration
     180	bmi		.Lecbenc1x		@ fewer than 4 left
     181	vld1.8		{q0-q1}, [r1]!
     182	vld1.8		{q2-q3}, [r1]!
     183	bl		aes_encrypt_4x
     184	vst1.8		{q0-q1}, [r0]!
     185	vst1.8		{q2-q3}, [r0]!
     186	b		.Lecbencloop4x
     187.Lecbenc1x:
     188	adds		r4, r4, #4		@ undo the over-subtraction
     189	beq		.Lecbencout
     190.Lecbencloop:
     191	vld1.8		{q0}, [r1]!		@ remaining blocks, one at a time
     192	bl		aes_encrypt
     193	vst1.8		{q0}, [r0]!
     194	subs		r4, r4, #1
     195	bne		.Lecbencloop
     196.Lecbencout:
     197	pop		{r4, pc}
     198ENDPROC(ce_aes_ecb_encrypt)
    199
	@ ECB decrypt: same structure as ce_aes_ecb_encrypt, using the
	@ decryption transforms.
     200ENTRY(ce_aes_ecb_decrypt)
     201	push		{r4, lr}
     202	ldr		r4, [sp, #8]		@ r4 = number of blocks
     203	prepare_key	r2, r3
     204.Lecbdecloop4x:
     205	subs		r4, r4, #4		@ 4 blocks per iteration
     206	bmi		.Lecbdec1x		@ fewer than 4 left
     207	vld1.8		{q0-q1}, [r1]!
     208	vld1.8		{q2-q3}, [r1]!
     209	bl		aes_decrypt_4x
     210	vst1.8		{q0-q1}, [r0]!
     211	vst1.8		{q2-q3}, [r0]!
     212	b		.Lecbdecloop4x
     213.Lecbdec1x:
     214	adds		r4, r4, #4		@ undo the over-subtraction
     215	beq		.Lecbdecout
     216.Lecbdecloop:
     217	vld1.8		{q0}, [r1]!		@ remaining blocks, one at a time
     218	bl		aes_decrypt
     219	vst1.8		{q0}, [r0]!
     220	subs		r4, r4, #1
     221	bne		.Lecbdecloop
     222.Lecbdecout:
     223	pop		{r4, pc}
    225
    226	/*
    227	 * aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
    228	 *		   int blocks, u8 iv[])
    229	 * aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
    230	 *		   int blocks, u8 iv[])
    231	 */
	@ CBC encrypt is inherently serial (each block chains into the next),
	@ so there is no 4x path.  q0 carries the running chain value
	@ (IV, then each ciphertext block).  r4 = blocks, r5 = iv (from stack).
     232ENTRY(ce_aes_cbc_encrypt)
     233	push		{r4-r6, lr}
     234	ldrd		r4, r5, [sp, #16]
     235	vld1.8		{q0}, [r5]
     236	prepare_key	r2, r3
     237.Lcbcencloop:
     238	vld1.8		{q1}, [r1]!		@ get next pt block
     239	veor		q0, q0, q1		@ ..and xor with iv
     240	bl		aes_encrypt
     241	vst1.8		{q0}, [r0]!
     242	subs		r4, r4, #1
     243	bne		.Lcbcencloop
     244	vst1.8		{q0}, [r5]		@ return last ct block as next iv
     245	pop		{r4-r6, pc}
     246ENDPROC(ce_aes_cbc_encrypt)
    247
	@ CBC decrypt parallelises freely: each pt block = D(ct) ^ prev ct.
	@ q15 carries the IV / previous ciphertext block across iterations;
	@ q4-q7 preserve copies of the ciphertext inputs for the final xors.
     248ENTRY(ce_aes_cbc_decrypt)
     249	push		{r4-r6, lr}
     250	ldrd		r4, r5, [sp, #16]
     251	vld1.8		{q15}, [r5]		@ keep iv in q15
     252	prepare_key	r2, r3
     253.Lcbcdecloop4x:
     254	subs		r4, r4, #4
     255	bmi		.Lcbcdec1x
     256	vld1.8		{q0-q1}, [r1]!
     257	vld1.8		{q2-q3}, [r1]!
     258	vmov		q4, q0			@ save ct blocks for chaining
     259	vmov		q5, q1
     260	vmov		q6, q2
     261	vmov		q7, q3
     262	bl		aes_decrypt_4x
     263	veor		q0, q0, q15		@ xor with prev ct (or iv)
     264	veor		q1, q1, q4
     265	veor		q2, q2, q5
     266	veor		q3, q3, q6
     267	vmov		q15, q7			@ last ct becomes next chain value
     268	vst1.8		{q0-q1}, [r0]!
     269	vst1.8		{q2-q3}, [r0]!
     270	b		.Lcbcdecloop4x
     271.Lcbcdec1x:
     272	adds		r4, r4, #4
     273	beq		.Lcbcdecout
     274	vmov		q6, q14			@ preserve last round key
     275.Lcbcdecloop:
	@ Fold the CBC xor into the final AddRoundKey: aes_decrypt's last
	@ veor uses q14, so q14 = last_key ^ prev_ct performs both at once.
     276	vld1.8		{q0}, [r1]!		@ get next ct block
     277	veor		q14, q15, q6		@ combine prev ct with last key
     278	vmov		q15, q0
     279	bl		aes_decrypt
     280	vst1.8		{q0}, [r0]!
     281	subs		r4, r4, #1
     282	bne		.Lcbcdecloop
     283.Lcbcdecout:
     284	vst1.8		{q15}, [r5]		@ return last ct block as next iv
     285	pop		{r4-r6, pc}
     286ENDPROC(ce_aes_cbc_decrypt)
    287
    288
    289	/*
    290	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
    291	 *			  int rounds, int bytes, u8 const iv[])
    292	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
    293	 *			  int rounds, int bytes, u8 const iv[])
    294	 */
    295
	@ CBC with ciphertext stealing, encrypt side.  Handles exactly the
	@ final 16 < bytes <= 32 of a message: two overlapping blocks are
	@ loaded, and q5/q6 (from .Lcts_permute_table, indexed by the tail
	@ length) steer vtbl so the partial tail is merged/padded correctly.
     296ENTRY(ce_aes_cbc_cts_encrypt)
     297	push		{r4-r6, lr}
     298	ldrd		r4, r5, [sp, #16]
     299
     300	movw		ip, :lower16:.Lcts_permute_table
     301	movt		ip, :upper16:.Lcts_permute_table
     302	sub		r4, r4, #16		@ r4 = tail length - 16
     303	add		lr, ip, #32
     304	add		ip, ip, r4
     305	sub		lr, lr, r4
     306	vld1.8		{q5}, [ip]		@ permute vectors for this tail len
     307	vld1.8		{q6}, [lr]
     308
     309	add		ip, r1, r4
     310	vld1.8		{q0}, [r1]			@ overlapping loads
     311	vld1.8		{q3}, [ip]
     312
     313	vld1.8		{q1}, [r5]			@ get iv
     314	prepare_key	r2, r3
     315
     316	veor		q0, q0, q1			@ xor with iv
     317	bl		aes_encrypt
     318
     319	vtbl.8		d4, {d0-d1}, d10
     320	vtbl.8		d5, {d0-d1}, d11
     321	vtbl.8		d2, {d6-d7}, d12
     322	vtbl.8		d3, {d6-d7}, d13
     323
     324	veor		q0, q0, q1
     325	bl		aes_encrypt
     326
     327	add		r4, r0, r4
     328	vst1.8		{q2}, [r4]			@ overlapping stores
     329	vst1.8		{q0}, [r0]
     330
     331	pop		{r4-r6, pc}
     332ENDPROC(ce_aes_cbc_cts_encrypt)
    333
	@ CBC with ciphertext stealing, decrypt side.  Mirror image of the
	@ encrypt path: decrypt the first block, splice in the stolen tail
	@ bytes via vtbl/vtbx, decrypt again, then xor with the iv (q3).
     334ENTRY(ce_aes_cbc_cts_decrypt)
     335	push		{r4-r6, lr}
     336	ldrd		r4, r5, [sp, #16]
     337
     338	movw		ip, :lower16:.Lcts_permute_table
     339	movt		ip, :upper16:.Lcts_permute_table
     340	sub		r4, r4, #16		@ r4 = tail length - 16
     341	add		lr, ip, #32
     342	add		ip, ip, r4
     343	sub		lr, lr, r4
     344	vld1.8		{q5}, [ip]		@ permute vectors for this tail len
     345	vld1.8		{q6}, [lr]
     346
     347	add		ip, r1, r4
     348	vld1.8		{q0}, [r1]			@ overlapping loads
     349	vld1.8		{q1}, [ip]
     350
     351	vld1.8		{q3}, [r5]			@ get iv
     352	prepare_key	r2, r3
     353
     354	bl		aes_decrypt
     355
     356	vtbl.8		d4, {d0-d1}, d10
     357	vtbl.8		d5, {d0-d1}, d11
     358	vtbx.8		d0, {d2-d3}, d12
     359	vtbx.8		d1, {d2-d3}, d13
     360
     361	veor		q1, q1, q2
     362	bl		aes_decrypt
     363	veor		q0, q0, q3			@ xor with iv
     364
     365	add		r4, r0, r4
     366	vst1.8		{q1}, [r4]			@ overlapping stores
     367	vst1.8		{q0}, [r0]
     368
     369	pop		{r4-r6, pc}
     370ENDPROC(ce_aes_cbc_cts_decrypt)
    371
    372
    373	/*
    374	 * aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
    375	 *		   int blocks, u8 ctr[])
    376	 */
	@ CTR mode.  q7 holds the big-endian counter block; r6 caches its low
	@ 32-bit word byte-swapped to host order so it can be incremented
	@ with plain adds.  The instruction order in the 4x path must not be
	@ changed — see the erratum note below.
     377ENTRY(ce_aes_ctr_encrypt)
     378	push		{r4-r6, lr}
     379	ldrd		r4, r5, [sp, #16]
     380	vld1.8		{q7}, [r5]		@ load ctr
     381	prepare_key	r2, r3
     382	vmov		r6, s31			@ keep swabbed ctr in r6
     383	rev		r6, r6
     384	cmn		r6, r4			@ 32 bit overflow?
     385	bcs		.Lctrloop		@ slow path handles the carry
     386.Lctrloop4x:
     387	subs		r4, r4, #4
     388	bmi		.Lctr1x
     389
     390	/*
     391	 * NOTE: the sequence below has been carefully tweaked to avoid
     392	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
     393	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
     394	 * may produce an incorrect result if they take their input from a
     395	 * register of which a single 32-bit lane has been updated the last
     396	 * time it was modified. To work around this, the lanes of registers
     397	 * q0-q3 below are not manipulated individually, and the different
     398	 * counter values are prepared by successive manipulations of q7.
     399	 */
     400	add		ip, r6, #1
     401	vmov		q0, q7
     402	rev		ip, ip
     403	add		lr, r6, #2
     404	vmov		s31, ip			@ set lane 3 of q1 via q7
     405	add		ip, r6, #3
     406	rev		lr, lr
     407	vmov		q1, q7
     408	vmov		s31, lr			@ set lane 3 of q2 via q7
     409	rev		ip, ip
     410	vmov		q2, q7
     411	vmov		s31, ip			@ set lane 3 of q3 via q7
     412	add		r6, r6, #4
     413	vmov		q3, q7
     414
     415	vld1.8		{q4-q5}, [r1]!
     416	vld1.8		{q6}, [r1]!
     417	vld1.8		{q15}, [r1]!
     418	bl		aes_encrypt_4x
     419	veor		q0, q0, q4		@ xor keystream with plaintext
     420	veor		q1, q1, q5
     421	veor		q2, q2, q6
     422	veor		q3, q3, q15
     423	rev		ip, r6
     424	vst1.8		{q0-q1}, [r0]!
     425	vst1.8		{q2-q3}, [r0]!
     426	vmov		s31, ip			@ update counter lane in q7
     427	b		.Lctrloop4x
     428.Lctr1x:
     429	adds		r4, r4, #4		@ undo the over-subtraction
     430	beq		.Lctrout
     431.Lctrloop:
     432	vmov		q0, q7
     433	bl		aes_encrypt
     434
     435	adds		r6, r6, #1		@ increment BE ctr
     436	rev		ip, r6
     437	vmov		s31, ip
     438	bcs		.Lctrcarry		@ low word wrapped: propagate carry
     439
     440.Lctrcarrydone:
     441	subs		r4, r4, #1
     442	bmi		.Lctrtailblock		@ blocks < 0 means tail block
     443	vld1.8		{q3}, [r1]!
     444	veor		q3, q0, q3
     445	vst1.8		{q3}, [r0]!
     446	bne		.Lctrloop
     447
     448.Lctrout:
     449	vst1.8		{q7}, [r5]		@ return next CTR value
     450	pop		{r4-r6, pc}
     451
     452.Lctrtailblock:
     453	vst1.8		{q0}, [r0, :64]		@ return the key stream
     454	b		.Lctrout
     455
	@ Ripple the +1 carry through the remaining counter words
	@ (s30, s29, s28 = progressively more significant BE words).
     456.Lctrcarry:
     457	.irp		sreg, s30, s29, s28
     458	vmov		ip, \sreg		@ load next word of ctr
     459	rev		ip, ip			@ ... to handle the carry
     460	adds		ip, ip, #1
     461	rev		ip, ip
     462	vmov		\sreg, ip
     463	bcc		.Lctrcarrydone
     464	.endr
     465	b		.Lctrcarrydone
     466ENDPROC(ce_aes_ctr_encrypt)
    467
    468	/*
    469	 * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
    470	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
    471	 * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
    472	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
    473	 */
    474
	@ Compute the next XTS tweak: \out = \in * x in GF(2^128), i.e. a
	@ 128-bit left shift (vadd doubles each 64-bit half) with the carry
	@ out of each half folded back in via \tmp.  \const holds the tweak
	@ mask built by ce_aes_xts_init; \tmp is clobbered.
     475	.macro		next_tweak, out, in, const, tmp
     476	vshr.s64	\tmp, \in, #63		@ arithmetic shift: all-ones if MSB set
     477	vand		\tmp, \tmp, \const
     478	vadd.u64	\out, \in, \in		@ shift each half left by 1
     479	vext.8		\tmp, \tmp, \tmp, #8	@ swap halves: route carries across
     480	veor		\out, \out, \tmp
     481	.endm
    482
	@ Shared XTS prologue (internal, called via bl from the entry points
	@ below with their {r4-r6, lr} frame already pushed, hence the stack
	@ offsets).  Builds the tweak mask in q15, loads bytes/iv/first into
	@ r4/r5/r6, and on the first call of a block encrypts the IV in q0
	@ with the second AES key by tail-calling into aes_encrypt.
     483ce_aes_xts_init:
     484	vmov.i32	d30, #0x87		@ compose tweak mask vector
     485	vmovl.u32	q15, d30
     486	vshr.u64	d30, d31, #7
     487
     488	ldrd		r4, r5, [sp, #16]	@ load args
     489	ldr		r6, [sp, #28]
     490	vld1.8		{q0}, [r5]		@ load iv
     491	teq		r6, #1			@ start of a block?
     492	bxne		lr
     493
     494	@ Encrypt the IV in q0 with the second AES key. This should only
     495	@ be done at the start of a block.
     496	ldr		r6, [sp, #24]		@ load AES key 2
     497	prepare_key	r6, r3
     498	add		ip, r6, #32		@ 3rd round key of key 2
     499	b		.Laes_encrypt_tweak	@ tail call
     500ENDPROC(ce_aes_xts_init)
    501
	@ XTS encrypt.  q4 holds the current tweak (q5-q7 the next three in
	@ the 4x path); r4 counts bytes, not blocks.  A trailing partial
	@ block (r4 not a multiple of 16) is handled with ciphertext
	@ stealing via .Lcts_permute_table.
     502ENTRY(ce_aes_xts_encrypt)
     503	push		{r4-r6, lr}
     504
     505	bl		ce_aes_xts_init		@ run shared prologue
     506	prepare_key	r2, r3
     507	vmov		q4, q0			@ q4 = current tweak
     508
     509	teq		r6, #0			@ start of a block?
     510	bne		.Lxtsenc4x		@ mid-stream: keep current tweak
     511
     512.Lxtsencloop4x:
     513	next_tweak	q4, q4, q15, q10
     514.Lxtsenc4x:
     515	subs		r4, r4, #64
     516	bmi		.Lxtsenc1x
     517	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
     518	vld1.8		{q2-q3}, [r1]!
     519	next_tweak	q5, q4, q15, q10
     520	veor		q0, q0, q4
     521	next_tweak	q6, q5, q15, q10
     522	veor		q1, q1, q5
     523	next_tweak	q7, q6, q15, q10
     524	veor		q2, q2, q6
     525	veor		q3, q3, q7
     526	bl		aes_encrypt_4x
     527	veor		q0, q0, q4		@ post-whiten with same tweaks
     528	veor		q1, q1, q5
     529	veor		q2, q2, q6
     530	veor		q3, q3, q7
     531	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
     532	vst1.8		{q2-q3}, [r0]!
     533	vmov		q4, q7			@ carry last tweak forward
     534	teq		r4, #0
     535	beq		.Lxtsencret
     536	b		.Lxtsencloop4x
     537.Lxtsenc1x:
     538	adds		r4, r4, #64		@ undo the over-subtraction
     539	beq		.Lxtsencout
     540	subs		r4, r4, #16
     541	bmi		.LxtsencctsNx		@ < 16 bytes left: CTS on last 4x ct
     542.Lxtsencloop:
     543	vld1.8		{q0}, [r1]!
     544.Lxtsencctsout:
     545	veor		q0, q0, q4
     546	bl		aes_encrypt
     547	veor		q0, q0, q4
     548	teq		r4, #0
     549	beq		.Lxtsencout
     550	subs		r4, r4, #16
     551	next_tweak	q4, q4, q15, q6
     552	bmi		.Lxtsenccts
     553	vst1.8		{q0}, [r0]!
     554	b		.Lxtsencloop
     555.Lxtsencout:
     556	vst1.8		{q0}, [r0]
     557.Lxtsencret:
     558	vst1.8		{q4}, [r5]		@ return tweak for next call
     559	pop		{r4-r6, pc}
     560
     561.LxtsencctsNx:
     562	vmov		q0, q3			@ steal from last block of 4x batch
     563	sub		r0, r0, #16
     564.Lxtsenccts:
	@ Ciphertext stealing: permute the last full ct block (q0) and the
	@ final partial pt block (q1) via table vectors q2/q3, then loop
	@ back to encrypt the combined block.
     565	movw		ip, :lower16:.Lcts_permute_table
     566	movt		ip, :upper16:.Lcts_permute_table
     567
     568	add		r1, r1, r4		@ rewind input pointer
     569	add		r4, r4, #16		@ # bytes in final block
     570	add		lr, ip, #32
     571	add		ip, ip, r4
     572	sub		lr, lr, r4
     573	add		r4, r0, r4		@ output address of final block
     574
     575	vld1.8		{q1}, [r1]		@ load final partial block
     576	vld1.8		{q2}, [ip]
     577	vld1.8		{q3}, [lr]
     578
     579	vtbl.8		d4, {d0-d1}, d4
     580	vtbl.8		d5, {d0-d1}, d5
     581	vtbx.8		d0, {d2-d3}, d6
     582	vtbx.8		d1, {d2-d3}, d7
     583
     584	vst1.8		{q2}, [r4]		@ overlapping stores
     585	mov		r4, #0
     586	b		.Lxtsencctsout
     587ENDPROC(ce_aes_xts_encrypt)
    588
    589
	@ XTS decrypt.  Same structure as the encrypt path, except the CTS
	@ tail needs the *next* tweak (q5) for the last full block while the
	@ partial block reuses the current one (q4), per the XTS spec.
     590ENTRY(ce_aes_xts_decrypt)
     591	push		{r4-r6, lr}
     592
     593	bl		ce_aes_xts_init		@ run shared prologue
     594	prepare_key	r2, r3
     595	vmov		q4, q0			@ q4 = current tweak
     596
     597	/* subtract 16 bytes if we are doing CTS */
     598	tst		r4, #0xf
     599	subne		r4, r4, #0x10
     600
     601	teq		r6, #0			@ start of a block?
     602	bne		.Lxtsdec4x		@ mid-stream: keep current tweak
     603
     604.Lxtsdecloop4x:
     605	next_tweak	q4, q4, q15, q10
     606.Lxtsdec4x:
     607	subs		r4, r4, #64
     608	bmi		.Lxtsdec1x
     609	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
     610	vld1.8		{q2-q3}, [r1]!
     611	next_tweak	q5, q4, q15, q10
     612	veor		q0, q0, q4
     613	next_tweak	q6, q5, q15, q10
     614	veor		q1, q1, q5
     615	next_tweak	q7, q6, q15, q10
     616	veor		q2, q2, q6
     617	veor		q3, q3, q7
     618	bl		aes_decrypt_4x
     619	veor		q0, q0, q4		@ post-whiten with same tweaks
     620	veor		q1, q1, q5
     621	veor		q2, q2, q6
     622	veor		q3, q3, q7
     623	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
     624	vst1.8		{q2-q3}, [r0]!
     625	vmov		q4, q7			@ carry last tweak forward
     626	teq		r4, #0
     627	beq		.Lxtsdecout
     628	b		.Lxtsdecloop4x
     629.Lxtsdec1x:
     630	adds		r4, r4, #64		@ undo the over-subtraction
     631	beq		.Lxtsdecout
     632	subs		r4, r4, #16
     633.Lxtsdecloop:
     634	vld1.8		{q0}, [r1]!
     635	bmi		.Lxtsdeccts		@ last block and a partial one left
     636.Lxtsdecctsout:
     637	veor		q0, q0, q4
     638	bl		aes_decrypt
     639	veor		q0, q0, q4
     640	vst1.8		{q0}, [r0]!
     641	teq		r4, #0
     642	beq		.Lxtsdecout
     643	subs		r4, r4, #16
     644	next_tweak	q4, q4, q15, q6
     645	b		.Lxtsdecloop
     646.Lxtsdecout:
     647	vst1.8		{q4}, [r5]		@ return tweak for next call
     648	pop		{r4-r6, pc}
     649
     650.Lxtsdeccts:
	@ Ciphertext stealing: decrypt the last full ct block with the next
	@ tweak (q5), splice in the stolen tail bytes via the permute table,
	@ then loop back to decrypt the combined block with q4.
     651	movw		ip, :lower16:.Lcts_permute_table
     652	movt		ip, :upper16:.Lcts_permute_table
     653
     654	add		r1, r1, r4		@ rewind input pointer
     655	add		r4, r4, #16		@ # bytes in final block
     656	add		lr, ip, #32
     657	add		ip, ip, r4
     658	sub		lr, lr, r4
     659	add		r4, r0, r4		@ output address of final block
     660
     661	next_tweak	q5, q4, q15, q6
     662
     663	vld1.8		{q1}, [r1]		@ load final partial block
     664	vld1.8		{q2}, [ip]
     665	vld1.8		{q3}, [lr]
     666
     667	veor		q0, q0, q5
     668	bl		aes_decrypt
     669	veor		q0, q0, q5
     670
     671	vtbl.8		d4, {d0-d1}, d4
     672	vtbl.8		d5, {d0-d1}, d5
     673	vtbx.8		d0, {d2-d3}, d6
     674	vtbx.8		d1, {d2-d3}, d7
     675
     676	vst1.8		{q2}, [r4]		@ overlapping stores
     677	mov		r4, #0
     678	b		.Lxtsdecctsout
     679ENDPROC(ce_aes_xts_decrypt)
    680
    681	/*
    682	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
    683	 *                             AES sbox substitution on each byte in
    684	 *                             'input'
    685	 */
	@ aese on a zeroed state computes SubBytes(ShiftRows(0 ^ q1)); with
	@ the input duplicated into all four lanes by vdup, ShiftRows has no
	@ effect, so lane 0 holds the plain per-byte sbox substitution.
     686ENTRY(ce_aes_sub)
     687	vdup.32		q1, r0
     688	veor		q0, q0, q0
     689	aese.8		q0, q1
     690	vmov		r0, s0
     691	bx		lr
     692ENDPROC(ce_aes_sub)
    693
    694	/*
    695	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
    696	 *                                        operation on round key *src
    697	 */
	@ Apply InvMixColumns to one 16-byte round key (used when deriving
	@ the decryption key schedule from the encryption one).
     698ENTRY(ce_aes_invert)
     699	vld1.32		{q0}, [r1]
     700	aesimc.8	q0, q0
     701	vst1.32		{q0}, [r0]
     702	bx		lr
     703ENDPROC(ce_aes_invert)
    704
	@ vtbl/vtbx index table for ciphertext stealing.  Loading 16 bytes at
	@ offset (tail_len) yields indices that select the tail bytes; 0xff
	@ entries make vtbl write zero and leave vtbx destinations untouched.
     705	.section	".rodata", "a"
     706	.align		6
     707.Lcts_permute_table:
     708	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
     709	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
     710	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
     711	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
     712	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
     713	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff