aes-neonbs-core.S - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
aes-neonbs-core.S (22660B)
      1/* SPDX-License-Identifier: GPL-2.0-only */
      2/*
      3 * Bit sliced AES using NEON instructions
      4 *
      5 * Copyright (C) 2017 Linaro Ltd.
      6 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
      7 */
      8
      9/*
     10 * The algorithm implemented here is described in detail by the paper
     11 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
     12 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
     13 *
     14 * This implementation is based primarily on the OpenSSL implementation
     15 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
     16 */
     17
     18#include <linux/linkage.h>
     19#include <asm/assembler.h>
     20
     /*
      * Everything below is assembled into .text with NEON (Advanced SIMD)
      * instructions enabled.
      */
     21	.text
     22	.fpu		neon
     23
     /*
      * Core register roles: 'rounds' (ip) is the round counter and 'bskey' (r4)
      * is the pointer into the converted key schedule. Both are consumed by
      * aesbs_encrypt8 / aesbs_decrypt8 and by the shift_rows / inv_mix_cols
      * macros.
      */
     24	rounds		.req	ip
     25	bskey		.req	r4
     26
     /*
      * qNl / qNh alias d(2N) / d(2N+1), the low and high 64-bit halves of qN.
      * They let a macro derive a half-register name from a Q-register argument
      * by pasting an 'l' or 'h' suffix onto it (see __tbl and __ldr).
      */
     27	q0l		.req	d0
     28	q0h		.req	d1
     29	q1l		.req	d2
     30	q1h		.req	d3
     31	q2l		.req	d4
     32	q2h		.req	d5
     33	q3l		.req	d6
     34	q3h		.req	d7
     35	q4l		.req	d8
     36	q4h		.req	d9
     37	q5l		.req	d10
     38	q5h		.req	d11
     39	q6l		.req	d12
     40	q6h		.req	d13
     41	q7l		.req	d14
     42	q7h		.req	d15
     43	q8l		.req	d16
     44	q8h		.req	d17
     45	q9l		.req	d18
     46	q9h		.req	d19
     47	q10l		.req	d20
     48	q10h		.req	d21
     49	q11l		.req	d22
     50	q11h		.req	d23
     51	q12l		.req	d24
     52	q12h		.req	d25
     53	q13l		.req	d26
     54	q13h		.req	d27
     55	q14l		.req	d28
     56	q14h		.req	d29
     57	q15l		.req	d30
     58	q15h		.req	d31
     59
     /*
      * __tbl out, tbl, in[, tmp]
      *
      * Byte-wise table lookup on a full Q register, done as two VTBL.8
      * operations (one per 64-bit half): 'out' receives the bytes of 'tbl'
      * selected by the indices held in 'in'.
      *
      * When 'out' and 'tbl' name the same register, the first VTBL overwrites
      * the low half of the table. The table is therefore copied to 'tmp' first
      * and the second lookup reads from that copy. In that case 'tmp' is
      * mandatory; assembly stops with an error if it is omitted.
      */
     60	.macro		__tbl, out, tbl, in, tmp
     61	.ifc		\out, \tbl
     62	.ifb		\tmp
     63	.error		__tbl needs temp register if out == tbl
     64	.endif
     65	vmov		\tmp, \out
     66	.endif
     67	vtbl.8		\out\()l, {\tbl}, \in\()l
     68	.ifc		\out, \tbl
     69	vtbl.8		\out\()h, {\tmp}, \in\()h
     70	.else
     71	vtbl.8		\out\()h, {\tbl}, \in\()h
     72	.endif
     73	.endm
     74
     /*
      * __ldr out, sym
      *
      * Load the 16 bytes at 'sym' into Q register 'out' using two 8-byte VLDR
      * instructions: the low half from sym, the high half from sym + 8.
      */
     75	.macro		__ldr, out, sym
     76	vldr		\out\()l, \sym
     77	vldr		\out\()h, \sym + 8
     78	.endm
     79
     /*
      * in_bs_ch b0..b7
      *
      * XOR-only (linear) transform applied in place to the eight bit-plane
      * registers. It is the first step of the sbox macro, ahead of inv_gf256.
      * Presumably this is the input basis change of the Kaesper/Schwabe
      * S-box -- confirm against the paper.
      *
      * Each veor consumes results of earlier ones, so the statement order is
      * significant.
      */
     80	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
     81	veor		\b2, \b2, \b1
     82	veor		\b5, \b5, \b6
     83	veor		\b3, \b3, \b0
     84	veor		\b6, \b6, \b2
     85	veor		\b5, \b5, \b0
     86	veor		\b6, \b6, \b3
     87	veor		\b3, \b3, \b7
     88	veor		\b7, \b7, \b5
     89	veor		\b3, \b3, \b4
     90	veor		\b4, \b4, \b5
     91	veor		\b2, \b2, \b7
     92	veor		\b3, \b3, \b1
     93	veor		\b1, \b1, \b5
     94	.endm
     95
     /*
      * out_bs_ch b0..b7
      *
      * XOR-only (linear) transform applied in place to the eight bit-plane
      * registers. It is the last step of the sbox macro, after inv_gf256;
      * sbox passes its registers to this macro in a permuted order.
      * Presumably this is the output basis change of the Kaesper/Schwabe
      * S-box -- confirm against the paper.
      *
      * Each veor consumes results of earlier ones, so the statement order is
      * significant.
      */
     96	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
     97	veor		\b0, \b0, \b6
     98	veor		\b1, \b1, \b4
     99	veor		\b4, \b4, \b6
    100	veor		\b2, \b2, \b0
    101	veor		\b6, \b6, \b1
    102	veor		\b1, \b1, \b5
    103	veor		\b5, \b5, \b3
    104	veor		\b3, \b3, \b7
    105	veor		\b7, \b7, \b5
    106	veor		\b2, \b2, \b5
    107	veor		\b4, \b4, \b7
    108	.endm
    109
    /*
     * inv_in_bs_ch (parameters declared as b6, b1, b2, b4, b7, b0, b3, b5)
     *
     * XOR-only (linear) transform applied in place to the eight bit-plane
     * registers. It is the first step of the inv_sbox macro, ahead of
     * inv_gf256.
     *
     * The parameter names are declared in a permuted order: the first
     * positional argument binds to b6, the second to b1, and so on.
     * Each veor consumes results of earlier ones, so the statement order is
     * significant.
     */
    110	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
    111	veor		\b1, \b1, \b7
    112	veor		\b4, \b4, \b7
    113	veor		\b7, \b7, \b5
    114	veor		\b1, \b1, \b3
    115	veor		\b2, \b2, \b5
    116	veor		\b3, \b3, \b7
    117	veor		\b6, \b6, \b1
    118	veor		\b2, \b2, \b0
    119	veor		\b5, \b5, \b3
    120	veor		\b4, \b4, \b6
    121	veor		\b0, \b0, \b6
    122	veor		\b1, \b1, \b4
    123	.endm
    124
    /*
     * inv_out_bs_ch (parameters declared as b6, b5, b0, b3, b7, b1, b4, b2)
     *
     * XOR-only (linear) transform applied in place to the eight bit-plane
     * registers. It is the last step of the inv_sbox macro, after inv_gf256.
     *
     * The parameter names are declared in a permuted order: the first
     * positional argument binds to b6, the second to b5, and so on.
     * Each veor consumes results of earlier ones, so the statement order is
     * significant.
     */
    125	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
    126	veor		\b1, \b1, \b5
    127	veor		\b2, \b2, \b7
    128	veor		\b3, \b3, \b1
    129	veor		\b4, \b4, \b5
    130	veor		\b7, \b7, \b5
    131	veor		\b3, \b3, \b4
    132	veor 		\b5, \b5, \b0
    133	veor		\b3, \b3, \b7
    134	veor		\b6, \b6, \b2
    135	veor		\b2, \b2, \b1
    136	veor		\b6, \b6, \b3
    137	veor		\b3, \b3, \b0
    138	veor		\b5, \b5, \b6
    139	.endm
    140
    /*
     * mul_gf4 x0, x1, y0, y1, t0, t1
     *
     * AND/XOR product of the bit pair (x0, x1) with the bit pair (y0, y1),
     * computed in place. Presumably a GF(2^2) multiplication, going by the
     * name -- confirm against the paper. Writing x0, x1 for the input values:
     *
     *   x1' = (x1 & y0) ^ (x0 & (y0 ^ y1))
     *   x0' = (x1 & y0) ^ ((x0 ^ x1) & y1)
     *
     * y0 and y1 are only read; t0 and t1 are clobbered.
     */
    141	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
    142	veor 		\t0, \y0, \y1
    143	vand		\t0, \t0, \x0
    144	veor		\x0, \x0, \x1
    145	vand		\t1, \x1, \y0
    146	vand		\x0, \x0, \y1
    147	veor		\x1, \t1, \t0
    148	veor		\x0, \x0, \t1
    149	.endm
    150
    /*
     * mul_gf4_n_gf4 x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
     *
     * Two independent two-bit products with their instructions interleaved.
     * Writing x0..x3 for the input values:
     *
     *   x1' = (x1 & y0) ^ ((x0 ^ x1) & y1)
     *   x0' = ((x0 ^ x1) & y1) ^ (x0 & (y0 ^ y1))
     *
     *   x2' = ((x2 ^ x3) & y3) ^ (x3 & y2)
     *   x3' = (x3 & y2) ^ (x2 & (y2 ^ y3))
     *
     * The (x2, x3) result has the same form as mul_gf4. The (x0, x1) result
     * combines the partial products differently; presumably it is the
     * product additionally scaled by the constant "N" the macro is named
     * after -- confirm against the paper.
     *
     * The y operands are only read; t0 and t1 are clobbered.
     */
    151	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
    152	veor		\t0, \y0, \y1
    153	veor 		\t1, \y2, \y3
    154	vand		\t0, \t0, \x0
    155	vand		\t1, \t1, \x2
    156	veor		\x0, \x0, \x1
    157	veor		\x2, \x2, \x3
    158	vand		\x1, \x1, \y0
    159	vand		\x3, \x3, \y2
    160	vand		\x0, \x0, \y1
    161	vand		\x2, \x2, \y3
    162	veor		\x1, \x1, \x0
    163	veor		\x2, \x2, \x3
    164	veor		\x0, \x0, \t0
    165	veor		\x3, \x3, \t1
    166	.endm
    167
    /*
     * mul_gf16_2 x0..x7, y0..y3, t0..t3
     *
     * Applies the same four-bit operand y0..y3 to two four-bit operands,
     * x0..x3 and x4..x7, in place, using mul_gf4 and mul_gf4_n_gf4 as
     * building blocks. Presumably two GF(2^4) multiplications by a common
     * factor, going by the name.
     *
     * y0 and y1 are temporarily replaced by y0 ^ y2 and y1 ^ y3, and are XORed
     * back to their original values before the final mul_gf4. y2 and y3 are
     * only read. t0..t3 are clobbered.
     */
    168	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
    169				    y0, y1, y2, y3, t0, t1, t2, t3
    170	veor		\t0, \x0, \x2
    171	veor		\t1, \x1, \x3
    172	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
    173	veor		\y0, \y0, \y2
    174	veor		\y1, \y1, \y3
    175	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
    176	veor		\x0, \x0, \t0
    177	veor		\x2, \x2, \t0
    178	veor		\x1, \x1, \t1
    179	veor		\x3, \x3, \t1
    180	veor		\t0, \x4, \x6
    181	veor		\t1, \x5, \x7
    182	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
    183	veor		\y0, \y0, \y2	/* undo the earlier y0 ^= y2 */
    184	veor		\y1, \y1, \y3	/* undo the earlier y1 ^= y3 */
    185	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
    186	veor		\x4, \x4, \t0
    187	veor		\x6, \x6, \t0
    188	veor		\x5, \x5, \t1
    189	veor		\x7, \x7, \t1
    190	.endm
    191
    /*
     * inv_gf256 x0..x7, t0..t3, s0..s3
     *
     * Nonlinear core shared by sbox and inv_sbox; it operates on the eight
     * bit planes x0..x7. Presumably this is the GF(2^8) inversion of the
     * Kaesper/Schwabe S-box decomposition -- confirm against the paper.
     *
     * The macro first derives intermediate values in the t and s registers
     * with a fixed XOR/AND/OR/VBSL network that only reads x0..x7. It then
     * finishes with mul_gf16_2, which updates x0..x7 in place. All eight
     * scratch registers t0..t3 and s0..s3 are clobbered.
     */
    192	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
    193				   t0, t1, t2, t3, s0, s1, s2, s3
    194	veor		\t3, \x4, \x6
    195	veor		\t0, \x5, \x7
    196	veor		\t1, \x1, \x3
    197	veor		\s1, \x7, \x6
    198	veor		\s0, \x0, \x2
    199	veor		\s3, \t3, \t0
    200	vorr		\t2, \t0, \t1
    201	vand		\s2, \t3, \s0
    202	vorr		\t3, \t3, \s0
    203	veor		\s0, \s0, \t1
    204	vand		\t0, \t0, \t1
    205	veor		\t1, \x3, \x2
    206	vand		\s3, \s3, \s0
    207	vand		\s1, \s1, \t1
    208	veor		\t1, \x4, \x5
    209	veor		\s0, \x1, \x0
    210	veor		\t3, \t3, \s1
    211	veor		\t2, \t2, \s1
    212	vand		\s1, \t1, \s0
    213	vorr		\t1, \t1, \s0
    214	veor		\t3, \t3, \s3
    215	veor		\t0, \t0, \s1
    216	veor		\t2, \t2, \s2
    217	veor		\t1, \t1, \s3
    218	veor		\t0, \t0, \s2
    219	vand		\s0, \x7, \x3
    220	veor		\t1, \t1, \s2
    221	vand		\s1, \x6, \x2
    222	vand		\s2, \x5, \x1
    223	vorr		\s3, \x4, \x0
    224	veor		\t3, \t3, \s0
    225	veor		\t1, \t1, \s2
    226	veor		\s0, \t0, \s3
    227	veor		\t2, \t2, \s1
    228	vand		\s2, \t3, \t1
    229	veor		\s1, \t2, \s2
    230	veor		\s3, \s0, \s2
    231	vbsl		\s1, \t1, \s0
    232	vmvn		\t0, \s0
    233	vbsl		\s0, \s1, \s3
    234	vbsl		\t0, \s1, \s3
    235	vbsl		\s3, \t3, \t2
    236	veor		\t3, \t3, \t2
    237	vand		\s2, \s0, \s3
    238	veor		\t1, \t1, \t0
    239	veor		\s2, \s2, \t3
    240	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
    241			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
    242	.endm
    243
    /*
     * sbox b0..b7, t0..t3, s0..s3
     *
     * Bit-sliced S-box on the eight bit planes b0..b7: in_bs_ch, then
     * inv_gf256, then out_bs_ch.
     *
     * The three steps are chained through permuted argument lists instead of
     * register moves, so the results end up in a permuted set of registers.
     * Callers continue with the order b0, b1, b4, b6, b3, b7, b2, b5 (see the
     * mix_cols and bitslice invocations in aesbs_encrypt8).
     *
     * t0..t3 and s0..s3 are scratch.
     */
    244	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
    245			      t0, t1, t2, t3, s0, s1, s2, s3
    246	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
    247	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
    248			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
    249	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
    250	.endm
    251
    /*
     * inv_sbox b0..b7, t0..t3, s0..s3
     *
     * Bit-sliced inverse S-box on the eight bit planes b0..b7: inv_in_bs_ch,
     * then inv_gf256, then inv_out_bs_ch.
     *
     * The three steps are chained through permuted argument lists instead of
     * register moves, so the results end up in a permuted set of registers.
     * Callers continue with the order b0, b1, b6, b4, b2, b7, b3, b5 (see the
     * inv_mix_cols and bitslice invocations in aesbs_decrypt8).
     *
     * t0..t3 and s0..s3 are scratch.
     */
    252	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
    253				  t0, t1, t2, t3, s0, s1, s2, s3
    254	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
    255	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
    256			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
    257	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
    258	.endm
    259
    /*
     * shift_rows x0..x7, t0..t3, mask
     *
     * For each of the eight bit planes: XOR in the next 16 bytes of the
     * bit-sliced round key read from [bskey], permute the bytes of the result
     * with 'mask' (via __tbl), and write it back to xN. Key addition and the
     * byte permutation are thus fused into one pass.
     *
     * In total 128 bytes of key are consumed and bskey is advanced by 128.
     * The key loads carry a :256 alignment hint, so bskey has to be 32-byte
     * aligned here. t0..t3 are clobbered; 'mask' is only read.
     */
    260	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
    261				    t0, t1, t2, t3, mask
    262	vld1.8		{\t0-\t1}, [bskey, :256]!
    263	veor		\t0, \t0, \x0
    264	vld1.8		{\t2-\t3}, [bskey, :256]!
    265	veor		\t1, \t1, \x1
    266	__tbl		\x0, \t0, \mask
    267	veor		\t2, \t2, \x2
    268	__tbl		\x1, \t1, \mask
    269	vld1.8		{\t0-\t1}, [bskey, :256]!
    270	veor		\t3, \t3, \x3
    271	__tbl		\x2, \t2, \mask
    272	__tbl		\x3, \t3, \mask
    273	vld1.8		{\t2-\t3}, [bskey, :256]!
    274	veor		\t0, \t0, \x4
    275	veor		\t1, \t1, \x5
    276	__tbl		\x4, \t0, \mask
    277	veor		\t2, \t2, \x6
    278	__tbl		\x5, \t1, \mask
    279	veor		\t3, \t3, \x7
    280	__tbl		\x6, \t2, \mask
    281	__tbl		\x7, \t3, \mask
    282	.endm
    283
    /*
     * inv_shift_rows x0..x7, t0..t3, mask
     *
     * Permute the bytes of each of x0..x7 in place with 'mask' (via __tbl).
     * Source and destination are the same register, so every lookup needs a
     * scratch register; t0..t3 are reused for that and are clobbered.
     * Unlike shift_rows, no round key is added here.
     */
    284	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
    285					t0, t1, t2, t3, mask
    286	__tbl		\x0, \x0, \mask, \t0
    287	__tbl		\x1, \x1, \mask, \t1
    288	__tbl		\x2, \x2, \mask, \t2
    289	__tbl		\x3, \x3, \mask, \t3
    290	__tbl		\x4, \x4, \mask, \t0
    291	__tbl		\x5, \x5, \mask, \t1
    292	__tbl		\x6, \x6, \mask, \t2
    293	__tbl		\x7, \x7, \mask, \t3
    294	.endm
    295
    /*
     * mix_cols x0..x7, t0..t7[, inv]
     *
     * Column-mixing step on the eight bit planes, built from byte rotations
     * of each 16-byte vector (VEXT.8 by 12 and by 8 bytes) and XORs.
     *
     * Most results are deliberately written to a different operand than the
     * one they were computed from. With 'inv' left blank the .ifb branch
     * assigns the results; with 'inv' given (as inv_mix_cols does) the
     * alternative assignment in the .else branch is used.
     * NOTE(review): together with the permuted argument lists used by the
     * callers in this file, both variants appear to return the planes to
     * natural q0..q7 order -- confirm before changing any argument list.
     *
     * t0..t7 are clobbered.
     */
    296	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
    297				  t0, t1, t2, t3, t4, t5, t6, t7, inv
    298	vext.8		\t0, \x0, \x0, #12
    299	vext.8		\t1, \x1, \x1, #12
    300	veor		\x0, \x0, \t0
    301	vext.8		\t2, \x2, \x2, #12
    302	veor		\x1, \x1, \t1
    303	vext.8		\t3, \x3, \x3, #12
    304	veor		\x2, \x2, \t2
    305	vext.8		\t4, \x4, \x4, #12
    306	veor		\x3, \x3, \t3
    307	vext.8		\t5, \x5, \x5, #12
    308	veor		\x4, \x4, \t4
    309	vext.8		\t6, \x6, \x6, #12
    310	veor		\x5, \x5, \t5
    311	vext.8		\t7, \x7, \x7, #12
    312	veor		\x6, \x6, \t6
    313	veor		\t1, \t1, \x0
    314	veor.8		\x7, \x7, \t7	/* the .8 suffix is redundant: VEOR is bitwise */
    315	vext.8		\x0, \x0, \x0, #8
    316	veor		\t2, \t2, \x1
    317	veor		\t0, \t0, \x7
    318	veor		\t1, \t1, \x7
    319	vext.8		\x1, \x1, \x1, #8
    320	veor		\t5, \t5, \x4
    321	veor		\x0, \x0, \t0
    322	veor		\t6, \t6, \x5
    323	veor		\x1, \x1, \t1
    324	vext.8		\t0, \x4, \x4, #8
    325	veor		\t4, \t4, \x3
    326	vext.8		\t1, \x5, \x5, #8
    327	veor		\t7, \t7, \x6
    328	vext.8		\x4, \x3, \x3, #8
    329	veor		\t3, \t3, \x2
    330	vext.8		\x5, \x7, \x7, #8
    331	veor		\t4, \t4, \x7
    332	vext.8		\x3, \x6, \x6, #8
    333	veor		\t3, \t3, \x7
    334	vext.8		\x6, \x2, \x2, #8
    335	veor		\x7, \t1, \t5
	/* final result assignment: forward variant */
    336	.ifb		\inv
    337	veor		\x2, \t0, \t4
    338	veor		\x4, \x4, \t3
    339	veor		\x5, \x5, \t7
    340	veor		\x3, \x3, \t6
    341	veor		\x6, \x6, \t2
	/* final result assignment: variant used by inv_mix_cols */
    342	.else
    343	veor		\t3, \t3, \x4
    344	veor		\x5, \x5, \t7
    345	veor		\x2, \x3, \t6
    346	veor		\x3, \t0, \t4
    347	veor		\x4, \x6, \t2
    348	vmov		\x6, \t3
    349	.endif
    350	.endm
    351
    /*
     * inv_mix_cols x0..x7, t0..t7
     *
     * 1. XORs the 128-byte bit-sliced round key at [bskey] into x0..x7. The
     *    three post-incremented loads advance bskey by 96 and the following
     *    'sub' takes off 224, so bskey ends up 128 bytes lower than on entry:
     *    the key schedule is walked backwards. The loads carry a :256
     *    alignment hint.
     * 2. Applies an XOR network based on 8-byte rotations (VEXT.8 #8) and then
     *    invokes mix_cols with the 'inv' flag set. Presumably this expresses
     *    InvMixColumns as a preprocessing step followed by the forward
     *    transform -- confirm against the paper.
     *
     * t0..t7 are clobbered.
     */
    352	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
    353				      t0, t1, t2, t3, t4, t5, t6, t7
    354	vld1.8		{\t0-\t1}, [bskey, :256]!
    355	veor		\x0, \x0, \t0
    356	vld1.8		{\t2-\t3}, [bskey, :256]!
    357	veor		\x1, \x1, \t1
    358	vld1.8		{\t4-\t5}, [bskey, :256]!
    359	veor		\x2, \x2, \t2
    360	vld1.8		{\t6-\t7}, [bskey, :256]
    361	sub		bskey, bskey, #224	/* net effect: bskey -= 128 */
    362	veor		\x3, \x3, \t3
    363	veor		\x4, \x4, \t4
    364	veor		\x5, \x5, \t5
    365	veor		\x6, \x6, \t6
    366	veor		\x7, \x7, \t7
    367	vext.8		\t0, \x0, \x0, #8
    368	vext.8		\t6, \x6, \x6, #8
    369	vext.8		\t7, \x7, \x7, #8
    370	veor		\t0, \t0, \x0
    371	vext.8		\t1, \x1, \x1, #8
    372	veor		\t6, \t6, \x6
    373	vext.8		\t2, \x2, \x2, #8
    374	veor		\t7, \t7, \x7
    375	vext.8		\t3, \x3, \x3, #8
    376	veor		\t1, \t1, \x1
    377	vext.8		\t4, \x4, \x4, #8
    378	veor		\t2, \t2, \x2
    379	vext.8		\t5, \x5, \x5, #8
    380	veor		\t3, \t3, \x3
    381	veor		\t4, \t4, \x4
    382	veor		\t5, \t5, \x5
    383	veor		\x0, \x0, \t6
    384	veor		\x1, \x1, \t6
    385	veor		\x2, \x2, \t0
    386	veor		\x4, \x4, \t2
    387	veor		\x3, \x3, \t1
    388	veor		\x1, \x1, \t7
    389	veor		\x2, \x2, \t7
    390	veor		\x4, \x4, \t6
    391	veor		\x5, \x5, \t3
    392	veor		\x3, \x3, \t6
    393	veor		\x6, \x6, \t4
    394	veor		\x4, \x4, \t7
    395	veor		\x5, \x5, \t7
    396	veor		\x7, \x7, \t5
    397	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
    398			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
    399	.endm
    400
    /*
     * swapmove_2x a0, b0, a1, b1, n, mask, t0, t1
     *
     * Two interleaved instances of the swap-move operation, one on (a0, b0)
     * using t0 and one on (a1, b1) using t1:
     *
     *   t  = ((b >> n) ^ a) & mask
     *   a ^= t
     *   b ^= t << n
     *
     * This exchanges the bits of 'a' selected by 'mask' with the bits of 'b'
     * selected by 'mask << n'. The shifts operate on 64-bit lanes and 'n' is
     * an immediate. 'mask' is only read; t0 and t1 are clobbered.
     */
    401	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
    402	vshr.u64	\t0, \b0, #\n
    403	vshr.u64	\t1, \b1, #\n
    404	veor		\t0, \t0, \a0
    405	veor		\t1, \t1, \a1
    406	vand		\t0, \t0, \mask
    407	vand		\t1, \t1, \mask
    408	veor		\a0, \a0, \t0
    409	vshl.s64	\t0, \t0, #\n
    410	veor		\a1, \a1, \t1
    411	vshl.s64	\t1, \t1, #\n
    412	veor		\b0, \b0, \t0
    413	veor		\b1, \b1, \t1
    414	.endm
    415
    /*
     * bitslice x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
     *
     * Note that the parameters are declared from x7 down to x0, so the first
     * positional argument binds to x7.
     *
     * Runs three stages of swapmove_2x with (shift, mask) = (1, 0x55),
     * (2, 0x33) and (4, 0x0f) across the eight registers. This converts
     * between the byte-oriented block layout and the bit-plane layout used
     * by the round macros; the same macro is invoked for both directions
     * (see aesbs_encrypt8 / aesbs_decrypt8).
     *
     * t0 and t1 hold the masks; t2 and t3 are scratch for swapmove_2x. All
     * four are clobbered.
     */
    416	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
    417	vmov.i8		\t0, #0x55
    418	vmov.i8		\t1, #0x33
    419	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
    420	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
    421	vmov.i8		\t0, #0x0f	/* 0x55 is no longer needed: reuse t0 */
    422	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
    423	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
    424	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
    425	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
    426	.endm
    427
    /*
     * M0: byte permutation applied (via __tbl) to every round key that
     * aesbs_convert_key converts to bit-plane form.
     */
    428	.align		4
    429M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
    430
    431	/*
    432	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
    433	 */
    /*
     * Register use: r0 = out, r1 = rk, r2 = rounds (the three register
     * arguments). q0-q15 are clobbered and r0-r2 are modified.
     *
     * Layout written to 'out':
     *   - the round 0 key, copied unchanged (16 bytes);
     *   - for each of the following rounds - 1 round keys, 8 x 16 bytes: one
     *     vector per bit mask 0x01 .. 0x80, in which every byte is 0xff or
     *     0x00 depending on whether that bit is set in the corresponding byte
     *     of the M0-permuted key. The vectors for masks 0x01, 0x02, 0x20 and
     *     0x40 are stored inverted; these are exactly the bits set in 0x63;
     *   - the final round key with 0x63 XORed into every byte (16 bytes).
     *
     * Alignment: the first store uses a :128 hint on 'out' and all
     * bit-plane stores use :256, so out + 16 has to be 32-byte aligned.
     */
    434ENTRY(aesbs_convert_key)
    435	vld1.32		{q7}, [r1]!		// load round 0 key
    436	vld1.32		{q15}, [r1]!		// load round 1 key
    437
    438	vmov.i8		q8,  #0x01		// bit masks
    439	vmov.i8		q9,  #0x02
    440	vmov.i8		q10, #0x04
    441	vmov.i8		q11, #0x08
    442	vmov.i8		q12, #0x10
    443	vmov.i8		q13, #0x20
    444	__ldr		q14, M0
    445
    446	sub		r2, r2, #1		// the loop below runs rounds - 1 times
    447	vst1.8		{q7}, [r0, :128]!	// save round 0 key
    448
    449.Lkey_loop:
    450	__tbl		q7, q15, q14
	/* q6 and q15 are free at this point: use them for the two last masks */
    451	vmov.i8		q6, #0x40
    452	vmov.i8		q15, #0x80
    453
	/* vtst.8 sets a byte to 0xff where (key byte & mask) != 0, else 0x00 */
    454	vtst.8		q0, q7, q8
    455	vtst.8		q1, q7, q9
    456	vtst.8		q2, q7, q10
    457	vtst.8		q3, q7, q11
    458	vtst.8		q4, q7, q12
    459	vtst.8		q5, q7, q13
    460	vtst.8		q6, q7, q6
    461	vtst.8		q7, q7, q15
    462	vld1.32		{q15}, [r1]!		// load next round key
	/* invert the planes for bits 0, 1, 5 and 6 (the bits set in 0x63) */
    463	vmvn		q0, q0
    464	vmvn		q1, q1
    465	vmvn		q5, q5
    466	vmvn		q6, q6
    467
    468	subs		r2, r2, #1
    469	vst1.8		{q0-q1}, [r0, :256]!
    470	vst1.8		{q2-q3}, [r0, :256]!
    471	vst1.8		{q4-q5}, [r0, :256]!
    472	vst1.8		{q6-q7}, [r0, :256]!
    473	bne		.Lkey_loop
    474
	/* q15 holds the final round key, loaded by the last loop iteration */
    475	vmov.i8		q7, #0x63		// compose .L63
    476	veor		q15, q15, q7
    477	vst1.8		{q15}, [r0, :128]
    478	bx		lr
    479ENDPROC(aesbs_convert_key)
    480
    /*
     * M0SR: byte permutation applied to the input blocks of aesbs_encrypt8
     * right after the round 0 key has been added (presumably M0 combined with
     * ShiftRows, going by the name).
     */
    481	.align		4
    482M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01
    483
    /*
     * aesbs_encrypt8 - encrypt eight blocks in parallel (internal helper with
     * a private register-based calling convention, no C prototype).
     *
     * In:    q0-q7   eight 16-byte input blocks
     *        bskey   key schedule in the layout written by aesbs_convert_key
     *        rounds  number of rounds
     * Out:   the eight result blocks, in the register order
     *        q0, q1, q4, q6, q3, q7, q2, q5
     * Clobb: q8-q15, bskey, rounds, condition flags
     */
    484aesbs_encrypt8:
    485	vld1.8		{q9}, [bskey, :128]!	// round 0 key
    486	__ldr		q8, M0SR
    487
    488	veor		q10, q0, q9		// xor with round0 key
    489	veor		q11, q1, q9
    490	__tbl		q0, q10, q8
    491	veor		q12, q2, q9
    492	__tbl		q1, q11, q8
    493	veor		q13, q3, q9
    494	__tbl		q2, q12, q8
    495	veor		q14, q4, q9
    496	__tbl		q3, q13, q8
    497	veor		q15, q5, q9
    498	__tbl		q4, q14, q8
    499	veor		q10, q6, q9
    500	__tbl		q5, q15, q8
    501	veor		q11, q7, q9
    502	__tbl		q6, q10, q8
    503	__tbl		q7, q11, q8
    504
    505	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
    506
    507	sub		rounds, rounds, #1
    508	b		.Lenc_sbox
    509
    /*
     * Byte permutations for shift_rows: SR is used in every round except the
     * last one, SRM0 for the final shift_rows before the last sbox.
     */
    510	.align		5
    511SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
    512SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d
    513
    514.Lenc_last:
    515	__ldr		q12, SRM0
    516.Lenc_loop:
    517	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
    518.Lenc_sbox:
    519	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
    520								q13, q14, q15
    521	subs		rounds, rounds, #1
    522	bcc		.Lenc_done		// borrow: that was the last sbox
    523
    524	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
    525								q13, q14, q15
    526
	/* flags are still those of the subs above: NEON code leaves them alone */
    527	beq		.Lenc_last		// counter hit zero: one round left
    528	__ldr		q12, SR
    529	b		.Lenc_loop
    530
    531.Lenc_done:
    532	vld1.8		{q12}, [bskey, :128]	// last round key
    533
    534	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
    535
    536	veor		q0, q0, q12
    537	veor		q1, q1, q12
    538	veor		q4, q4, q12
    539	veor		q6, q6, q12
    540	veor		q3, q3, q12
    541	veor		q7, q7, q12
    542	veor		q2, q2, q12
    543	veor		q5, q5, q12
    544	bx		lr
    545ENDPROC(aesbs_encrypt8)
    546
    /*
     * M0ISR: byte permutation applied to the input blocks of aesbs_decrypt8
     * right after the first key has been added (presumably M0 combined with
     * the inverse ShiftRows, going by the name).
     */
    547	.align		4
    548M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509
    549
    /*
     * aesbs_decrypt8 - decrypt eight blocks in parallel (internal helper with
     * a private register-based calling convention, no C prototype).
     *
     * In:    q0-q7   eight 16-byte input blocks
     *        bskey   start of the key schedule written by aesbs_convert_key
     *        rounds  number of rounds
     * Out:   the eight result blocks, in the register order
     *        q0, q1, q6, q4, q2, q7, q3, q5
     * Clobb: q8-q15, bskey, rounds, condition flags
     *
     * The key schedule is consumed back to front: the 16-byte key at the very
     * end is applied first, inv_mix_cols steps backwards through the 128-byte
     * bit-plane keys, and the 16-byte key at the start is applied last.
     */
    550aesbs_decrypt8:
	/* bskey += rounds * 128 - 112: address of the trailing 16-byte key */
    551	add		bskey, bskey, rounds, lsl #7
    552	sub		bskey, bskey, #112
    553	vld1.8		{q9}, [bskey, :128]	// round 0 key
    554	sub		bskey, bskey, #128	// start of the last 128-byte key
    555	__ldr		q8, M0ISR
    556
    557	veor		q10, q0, q9		// xor with round0 key
    558	veor		q11, q1, q9
    559	__tbl		q0, q10, q8
    560	veor		q12, q2, q9
    561	__tbl		q1, q11, q8
    562	veor		q13, q3, q9
    563	__tbl		q2, q12, q8
    564	veor		q14, q4, q9
    565	__tbl		q3, q13, q8
    566	veor		q15, q5, q9
    567	__tbl		q4, q14, q8
    568	veor		q10, q6, q9
    569	__tbl		q5, q15, q8
    570	veor		q11, q7, q9
    571	__tbl		q6, q10, q8
    572	__tbl		q7, q11, q8
    573
    574	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
    575
    576	sub		rounds, rounds, #1
    577	b		.Ldec_sbox
    578
    /*
     * Byte permutations for inv_shift_rows: ISR is used in every round except
     * the last one, ISRM0 for the final inv_shift_rows before the last
     * inv_sbox.
     */
    579	.align		5
    580ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
    581ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d
    582
    583.Ldec_last:
    584	__ldr		q12, ISRM0
    585.Ldec_loop:
    586	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
    587.Ldec_sbox:
    588	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
    589								q13, q14, q15
    590	subs		rounds, rounds, #1
    591	bcc		.Ldec_done		// borrow: that was the last inv_sbox
    592
    593	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
    594								q13, q14, q15
    595
	/* flags are still those of the subs above: inv_mix_cols uses a plain sub */
    596	beq		.Ldec_last		// counter hit zero: one round left
    597	__ldr		q12, ISR
    598	b		.Ldec_loop
    599
    600.Ldec_done:
	/* inv_mix_cols left bskey 112 bytes below the start of the schedule */
    601	add		bskey, bskey, #112
    602	vld1.8		{q12}, [bskey, :128]	// last round key
    603
    604	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
    605
    606	veor		q0, q0, q12
    607	veor		q1, q1, q12
    608	veor		q6, q6, q12
    609	veor		q4, q4, q12
    610	veor		q2, q2, q12
    611	veor		q7, q7, q12
    612	veor		q3, q3, q12
    613	veor		q5, q5, q12
    614	bx		lr
    615ENDPROC(aesbs_decrypt8)
    616
    617	/*
    618	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
    619	 *		     int blocks)
    620	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
    621	 *		     int blocks)
    622	 */
    /*
     * __ecb_crypt do8, o0..o7
     *
     * Body shared by the two ECB entry points. 'do8' is the eight-block core
     * routine to call; o0..o7 name the registers in which that routine leaves
     * its eight result blocks, in block order.
     *
     * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds (register
     * arguments); r5 = remaining block count, loaded from the first stack
     * argument.
     *
     * Each pass of the loop handles up to eight blocks. When fewer than eight
     * remain, the computed gotos jump (blocks & 7) instructions ahead of the
     * labels 0: and 1:, so only the last few loads / stores of each run are
     * executed. This relies on every vld1 / vst1 in those runs being encoded
     * in 4 bytes.
     */
    623	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
    624	push		{r4-r6, lr}
    625	ldr		r5, [sp, #16]		// number of blocks
    626
    62799:	adr		ip, 0f
    628	and		lr, r5, #7
    629	cmp		r5, #8
    630	sub		ip, ip, lr, lsl #2	// 4 bytes per skipped-into load
    631	movlt		pc, ip			// computed goto if blocks < 8
    632
    633	vld1.8		{q0}, [r1]!
    634	vld1.8		{q1}, [r1]!
    635	vld1.8		{q2}, [r1]!
    636	vld1.8		{q3}, [r1]!
    637	vld1.8		{q4}, [r1]!
    638	vld1.8		{q5}, [r1]!
    639	vld1.8		{q6}, [r1]!
    640	vld1.8		{q7}, [r1]!
    641
    6420:	mov		bskey, r2
    643	mov		rounds, r3
    644	bl		\do8
    645
	/* recompute the jump offset: \do8 clobbers ip (rounds) and the flags */
    646	adr		ip, 1f
    647	and		lr, r5, #7
    648	cmp		r5, #8
    649	sub		ip, ip, lr, lsl #2
    650	movlt		pc, ip			// computed goto if blocks < 8
    651
    652	vst1.8		{\o0}, [r0]!
    653	vst1.8		{\o1}, [r0]!
    654	vst1.8		{\o2}, [r0]!
    655	vst1.8		{\o3}, [r0]!
    656	vst1.8		{\o4}, [r0]!
    657	vst1.8		{\o5}, [r0]!
    658	vst1.8		{\o6}, [r0]!
    659	vst1.8		{\o7}, [r0]!
    660
    6611:	subs		r5, r5, #8
    662	bgt		99b
    663
    664	pop		{r4-r6, pc}
    665	.endm
    666
    /*
     * aesbs_ecb_encrypt: __ecb_crypt instantiated with aesbs_encrypt8 and the
     * register order in which that routine returns its eight blocks.
     */
    667	.align		4
    668ENTRY(aesbs_ecb_encrypt)
    669	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
    670ENDPROC(aesbs_ecb_encrypt)
    671
    /*
     * aesbs_ecb_decrypt: __ecb_crypt instantiated with aesbs_decrypt8 and the
     * register order in which that routine returns its eight blocks.
     */
    672	.align		4
    673ENTRY(aesbs_ecb_decrypt)
    674	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
    675ENDPROC(aesbs_ecb_decrypt)
    676
    677	/*
    678	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
    679	 *		     int rounds, int blocks, u8 iv[])
    680	 */
    /*
     * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds (register
     * arguments); r5 = remaining block count and r6 = iv, both loaded from
     * the stack.
     *
     * Each pass of the loop handles up to eight blocks:
     *   1. the ciphertext is loaded into q0-q7 through lr (a copy of r1) and
     *      decrypted with aesbs_decrypt8;
     *   2. q8 is loaded with the IV and q9-q15 are preset to copies of it;
     *      the ciphertext is then read a second time, through r1, into
     *      q9-q15 to serve as chaining values;
     *   3. each decrypted block is XORed with its chaining value and stored.
     *      The last ciphertext block of the pass is loaded into q8 and
     *      written back to 'iv' for the next pass / next call.
     *
     * When fewer than eight blocks remain, the computed gotos enter each
     * instruction run part-way, so that only the last (blocks & 7) slots are
     * executed. The first block used then finds the IV copy in its chaining
     * register. The slot sizes (4 bytes per load, 8 bytes per veor/vst1
     * pair) are baked into the offset arithmetic.
     */
    681	.align		4
    682ENTRY(aesbs_cbc_decrypt)
    683	mov		ip, sp
    684	push		{r4-r6, lr}
    685	ldm		ip, {r5-r6}		// load args 4-5
    686
    68799:	adr		ip, 0f
    688	and		lr, r5, #7
    689	cmp		r5, #8
    690	sub		ip, ip, lr, lsl #2
    691	mov		lr, r1			// first pass over the input uses lr
    692	movlt		pc, ip			// computed goto if blocks < 8
    693
    694	vld1.8		{q0}, [lr]!
    695	vld1.8		{q1}, [lr]!
    696	vld1.8		{q2}, [lr]!
    697	vld1.8		{q3}, [lr]!
    698	vld1.8		{q4}, [lr]!
    699	vld1.8		{q5}, [lr]!
    700	vld1.8		{q6}, [lr]!
    701	vld1.8		{q7}, [lr]
    702
    7030:	mov		bskey, r2
    704	mov		rounds, r3
    705	bl		aesbs_decrypt8
    706
	/* default every chaining value to the IV */
    707	vld1.8		{q8}, [r6]
    708	vmov		q9, q8
    709	vmov		q10, q8
    710	vmov		q11, q8
    711	vmov		q12, q8
    712	vmov		q13, q8
    713	vmov		q14, q8
    714	vmov		q15, q8
    715
    716	adr		ip, 1f
    717	and		lr, r5, #7
    718	cmp		r5, #8
    719	sub		ip, ip, lr, lsl #2
    720	movlt		pc, ip			// computed goto if blocks < 8
    721
    722	vld1.8		{q9}, [r1]!
    723	vld1.8		{q10}, [r1]!
    724	vld1.8		{q11}, [r1]!
    725	vld1.8		{q12}, [r1]!
    726	vld1.8		{q13}, [r1]!
    727	vld1.8		{q14}, [r1]!
    728	vld1.8		{q15}, [r1]!
    729	W(nop)					// pads the run to eight 4-byte slots
    730
	/* lr and the flags still hold (blocks & 7) and the cmp result from above */
    7311:	adr		ip, 2f
    732	sub		ip, ip, lr, lsl #3	// 8 bytes per veor/vst1 pair
    733	movlt		pc, ip			// computed goto if blocks < 8
    734
    735	veor		q0, q0, q8
    736	vst1.8		{q0}, [r0]!
    737	veor		q1, q1, q9
    738	vst1.8		{q1}, [r0]!
    739	veor		q6, q6, q10
    740	vst1.8		{q6}, [r0]!
    741	veor		q4, q4, q11
    742	vst1.8		{q4}, [r0]!
    743	veor		q2, q2, q12
    744	vst1.8		{q2}, [r0]!
    745	veor		q7, q7, q13
    746	vst1.8		{q7}, [r0]!
    747	veor		q3, q3, q14
    748	vst1.8		{q3}, [r0]!
    749	veor		q5, q5, q15
    750	vld1.8		{q8}, [r1]!		// load next round's iv
    7512:	vst1.8		{q5}, [r0]!
    752
    753	subs		r5, r5, #8
    754	vst1.8		{q8}, [r6]		// store next round's iv
    755	bgt		99b
    756
    757	pop		{r4-r6, pc}
    758ENDPROC(aesbs_cbc_decrypt)
    759
    /*
     * next_ctr q
     *
     * Materialise the current counter value in Q register 'q' and advance the
     * counter by one.
     *
     * The counter is kept as four native 32-bit words in r7:r8:r9:r10, with
     * r7 the most significant and r10 the least significant word. The words
     * are moved into 'q' before they are modified, so 'q' receives the
     * pre-increment value; VREV32.8 then byte-swaps each word into the
     * in-memory counter layout. The add/adc chain propagates the carry
     * through all 128 bits (the vmov in between does not touch the flags).
     *
     * Expands to exactly 7 instructions; the computed goto in
     * aesbs_ctr_encrypt depends on that size. Clobbers the condition flags.
     */
    760	.macro		next_ctr, q
    761	vmov		\q\()h, r9, r10
    762	adds		r10, r10, #1
    763	adcs		r9, r9, #0
    764	vmov		\q\()l, r7, r8
    765	adcs		r8, r8, #0
    766	adc		r7, r7, #0
    767	vrev32.8	\q, \q
    768	.endm
    769
    770	/*
    771	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
    772	 *		     int rounds, int bytes, u8 ctr[])
    773	 */
    /*
     * Register use after the prologue:
     *   r0 = out, r1 = in, r2 = rk, r3 = rounds   (register arguments)
     *   r5 = bytes remaining, r6 = ctr             (loaded from the stack)
     *   r7:r8:r9:r10 = next counter value as four native 32-bit words, r7
     *                  most significant (see next_ctr)
     *   r4 = 16 - (bytes % 16), used to back up the data pointers for a
     *        partial final block
     *
     * Each pass of the loop at 99: handles up to 128 bytes. When fewer than
     * eight blocks are needed, the computed gotos skip the leading next_ctr /
     * load / store slots. The data therefore always occupies the last
     * registers of each sequence, and the final block is always the one
     * handled in q5 (keystream) and q15 (input).
     *
     * A final block of n < 16 bytes is loaded and stored as the 16 bytes that
     * end at the end of the buffer:
     *   - if this pass covers at least 16 bytes, the 16 - n leading bytes of
     *     that store are refilled from the previous output block (path 3:);
     *   - otherwise the 16 bytes are first reloaded from 'out' and the
     *     leading 16 - n bytes are written back unchanged (path 4:).
     * NOTE(review): when the total length is below 16 this reads memory in
     * front of 'in' and reads/writes memory in front of 'out' -- presumably
     * the caller arranges for that to be valid; confirm against the glue code.
     *
     * The updated counter is stored back to 'ctr' on the normal exit; the
     * early exit on path 4: does not store it.
     */
    774ENTRY(aesbs_ctr_encrypt)
    775	mov		ip, sp
    776	push		{r4-r10, lr}
    777
    778	ldm		ip, {r5, r6}		// load args 4-5
    779	vld1.8		{q0}, [r6]		// load counter
    780	vrev32.8	q1, q0			// byte-swap each word to native order
    781	vmov		r9, r10, d3
    782	vmov		r7, r8, d2
    783
	/* r7:r8:r9:r10 = counter + 1, i.e. the value for the second block */
    784	adds		r10, r10, #1
    785	adcs		r9, r9, #0
    786	adcs		r8, r8, #0
    787	adc		r7, r7, #0
    788
	/*
	 * Preset q1-q7 to the base counter in q0, then enter the next_ctr run
	 * so that only the last (blocks - 1) expansions execute. With
	 * lr = 16 * (blocks - 1), ip = 0f - 2 * lr + lr / 4 steps back 28 bytes
	 * (one next_ctr expansion) per additional block.
	 */
    78999:	vmov		q1, q0
    790	sub		lr, r5, #1
    791	vmov		q2, q0
    792	adr		ip, 0f
    793	vmov		q3, q0
    794	and		lr, lr, #112
    795	vmov		q4, q0
    796	cmp		r5, #112
    797	vmov		q5, q0
    798	sub		ip, ip, lr, lsl #1
    799	vmov		q6, q0
    800	add		ip, ip, lr, lsr #2
    801	vmov		q7, q0
    802	movle		pc, ip			// computed goto if bytes <= 112
    803
    804	next_ctr	q1
    805	next_ctr	q2
    806	next_ctr	q3
    807	next_ctr	q4
    808	next_ctr	q5
    809	next_ctr	q6
    810	next_ctr	q7
    811
    8120:	mov		bskey, r2
    813	mov		rounds, r3
    814	bl		aesbs_encrypt8
    815
	/*
	 * Flag protocol from here on:
	 *   C set   <=> bytes >= 128 (from the cmp; not the last partial pass)
	 *   Z clear <=> this pass ends in a partial block (bytes % 16 != 0 and
	 *               bytes < 128)
	 */
    816	adr		ip, 1f
    817	sub		lr, r5, #1
    818	cmp		r5, #128
    819	bic		lr, lr, #15
    820	ands		r4, r5, #15		// preserves C flag
    821	teqcs		r5, r5			// set Z flag if not last iteration
    822	sub		ip, ip, lr, lsr #2	// 4 bytes per load slot
    823	rsb		r4, r4, #16		// r4 = 16 - (bytes % 16)
    824	movcc		pc, ip			// computed goto if bytes < 128
    825
    826	vld1.8		{q8}, [r1]!
    827	vld1.8		{q9}, [r1]!
    828	vld1.8		{q10}, [r1]!
    829	vld1.8		{q11}, [r1]!
    830	vld1.8		{q12}, [r1]!
    831	vld1.8		{q13}, [r1]!
    832	vld1.8		{q14}, [r1]!
	/* partial final block: back up so the 16-byte load ends at the input end */
    8331:	subne		r1, r1, r4
    834	vld1.8		{q15}, [r1]!
    835
	/* the store run has the same slot layout: retarget ip from 1: to 2: */
    836	add		ip, ip, #2f - 1b
    837
    838	veor		q0, q0, q8
    839	veor		q1, q1, q9
    840	veor		q4, q4, q10
    841	veor		q6, q6, q11
    842	veor		q3, q3, q12
    843	veor		q7, q7, q13
    844	veor		q2, q2, q14
    845	bne		3f			// partial final block: see below
    846	veor		q5, q5, q15
    847
    848	movcc		pc, ip			// computed goto if bytes < 128
    849
    850	vst1.8		{q0}, [r0]!
    851	vst1.8		{q1}, [r0]!
    852	vst1.8		{q4}, [r0]!
    853	vst1.8		{q6}, [r0]!
    854	vst1.8		{q3}, [r0]!
    855	vst1.8		{q7}, [r0]!
    856	vst1.8		{q2}, [r0]!
	/* partial final block: back up so the 16-byte store ends at the output end */
    8572:	subne		r0, r0, r4
    858	vst1.8		{q5}, [r0]!
    859
    860	next_ctr	q0			// q0 = base counter for the next pass
    861
    862	subs		r5, r5, #128
    863	bgt		99b
    864
    865	vst1.8		{q0}, [r6]
    866	pop		{r4-r10, pc}
    867
	/*
	 * Partial final block. lr = .Lpermute_table + (bytes % 16), so q8 is an
	 * index vector that moves the first (bytes % 16) keystream bytes of q5
	 * to the end of the register and zeroes the rest, and q9 selects the
	 * trailing bytes of the previous output block for the front.
	 */
    8683:	adr		lr, .Lpermute_table + 16
    869	cmp		r5, #16			// Z flag remains cleared
    870	sub		lr, lr, r4
    871	vld1.8		{q8-q9}, [lr]
    872	vtbl.8		d16, {q5}, d16
    873	vtbl.8		d17, {q5}, d17
    874	veor		q5, q8, q15
    875	bcc		4f			// have to reload prev if R5 < 16
	/* vtbx leaves bytes with an out-of-range (0xff) index untouched */
    876	vtbx.8		d10, {q2}, d18
    877	vtbx.8		d11, {q2}, d19
    878	mov		pc, ip			// branch back to VST sequence
    879
    8804:	sub		r0, r0, r4
    881	vshr.s8		q9, q9, #7		// create mask for VBIF
    882	vld1.8		{q8}, [r0]		// reload
    883	vbif		q5, q8, q9		// keep reloaded bytes where mask is 0
    884	vst1.8		{q5}, [r0]
    885	pop		{r4-r10, pc}
    886ENDPROC(aesbs_ctr_encrypt)
    887
    /*
     * .Lpermute_table: 16 x 0xff, the identity indices 0x00..0x0f, 16 x 0xff.
     *
     * aesbs_ctr_encrypt loads 32 bytes from (table + bytes % 16) to obtain
     * VTBL/VTBX index vectors for a partial final block; 0xff is an
     * out-of-range index.
     */
    888	.align		6
    889.Lpermute_table:
    890	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    891	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    892	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
    893	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
    894	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    895	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    896
    /*
     * next_tweak out, in, const, tmp
     *
     * Derive the next XTS tweak: out = in shifted left by one bit as a
     * 128-bit value, with a conditional XOR of the constants held in 'const'.
     *
     *   tmp  = per 64-bit lane: all ones if the lane's top bit is set, else 0
     *   tmp &= const
     *   out  = in + in                 (each 64-bit lane shifted left by one)
     *   swap the two 64-bit halves of tmp, then out ^= tmp
     *
     * With 'const' as composed in __xts_prepare8 (low lane 1, high lane
     * 0x87), the bit shifted out of the low lane is carried into the high
     * lane, and a bit shifted out of the high lane folds 0x87 into the low
     * lane. 'const' is only read; 'tmp' is clobbered. 'out' may differ from
     * 'in'.
     */
    897	.macro		next_tweak, out, in, const, tmp
    898	vshr.s64	\tmp, \in, #63
    899	vand		\tmp, \tmp, \const
    900	vadd.u64	\out, \in, \in
    901	vext.8		\tmp, \tmp, \tmp, #8
    902	veor		\out, \out, \tmp
    903	.endm
    904
    905	/*
    906	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
    907	 *		     int blocks, u8 iv[], int reorder_last_tweak)
    908	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
    909	 *		     int blocks, u8 iv[], int reorder_last_tweak)
    910	 */
    /*
     * __xts_prepare8 - load up to eight input blocks, XOR each with its tweak
     * and save the tweaks for the output pass (internal helper of
     * __xts_crypt, private register-based calling convention).
     *
     * In:    r1  input pointer (advanced past the blocks loaded)
     *        r6  number of blocks remaining
     *        r7  pointer to the current tweak ("iv")
     *        r8  <= 0 requests swapping the last two tweaks (see below)
     *        sp  16-byte aligned buffer with room for eight tweaks
     * Out:   q0-q7 tweaked input blocks (the last 'blocks' registers are used
     *        when fewer than eight remain); the tweaks, stored consecutively
     *        from sp; the following tweak stored back to [r7]
     * Clobb: r4, ip, q12-q15, condition flags
     *
     * q12 and q14 alternate as "current" and "next" tweak. Both start out as
     * the loaded value, so the computed goto may enter at any group. Each
     * group before label 0: is 32 bytes long, hence the 'lsl #5'.
     *
     * On the final batch (r6 <= 8), if r8 <= 0 the last block uses the
     * following tweak and the tweak it would normally have used is stored as
     * the next iv instead. Presumably this supports ciphertext stealing on
     * decryption -- confirm against the caller.
     */
    911	.align		6
    912__xts_prepare8:
    913	vld1.8		{q14}, [r7]		// load iv
	/* q15 = { low 64 bits: 1, high 64 bits: 0x87 }, the next_tweak constant */
    914	vmov.i32	d30, #0x87		// compose tweak mask vector
    915	vmovl.u32	q15, d30
    916	vshr.u64	d30, d31, #7
    917	vmov		q12, q14
    918
    919	adr		ip, 0f
    920	and		r4, r6, #7
    921	cmp		r6, #8
    922	sub		ip, ip, r4, lsl #5
    923	mov		r4, sp			// r4 walks the tweak buffer
    924	movlt		pc, ip			// computed goto if blocks < 8
    925
    926	vld1.8		{q0}, [r1]!
    927	next_tweak	q12, q14, q15, q13
    928	veor		q0, q0, q14
    929	vst1.8		{q14}, [r4, :128]!
    930
    931	vld1.8		{q1}, [r1]!
    932	next_tweak	q14, q12, q15, q13
    933	veor		q1, q1, q12
    934	vst1.8		{q12}, [r4, :128]!
    935
    936	vld1.8		{q2}, [r1]!
    937	next_tweak	q12, q14, q15, q13
    938	veor		q2, q2, q14
    939	vst1.8		{q14}, [r4, :128]!
    940
    941	vld1.8		{q3}, [r1]!
    942	next_tweak	q14, q12, q15, q13
    943	veor		q3, q3, q12
    944	vst1.8		{q12}, [r4, :128]!
    945
    946	vld1.8		{q4}, [r1]!
    947	next_tweak	q12, q14, q15, q13
    948	veor		q4, q4, q14
    949	vst1.8		{q14}, [r4, :128]!
    950
    951	vld1.8		{q5}, [r1]!
    952	next_tweak	q14, q12, q15, q13
    953	veor		q5, q5, q12
    954	vst1.8		{q12}, [r4, :128]!
    955
    956	vld1.8		{q6}, [r1]!
    957	next_tweak	q12, q14, q15, q13
    958	veor		q6, q6, q14
    959	vst1.8		{q14}, [r4, :128]!
    960
    961	vld1.8		{q7}, [r1]!
    962	next_tweak	q14, q12, q15, q13
	/* flags are still those of 'cmp r6, #8': LE means this is the last batch */
    963THUMB(	itt		le		)
    964	W(cmple)	r8, #0
    965	ble		1f
    9660:	veor		q7, q7, q12
    967	vst1.8		{q12}, [r4, :128]
    968
    969	vst1.8		{q14}, [r7]		// store next iv
    970	bx		lr
    971
    9721:	vswp		q12, q14		// last block uses the following tweak
    973	b		0b
    974ENDPROC(__xts_prepare8)
    975
    /*
     * __xts_crypt do8, o0..o7
     *
     * Body shared by the two XTS entry points. 'do8' is the eight-block core
     * routine to call; o0..o7 name the registers in which that routine leaves
     * its eight result blocks, in block order.
     *
     * On entry ip holds the "reorder last tweak" flag set up by the entry
     * point; it is converted to r8 = 1 - ip for __xts_prepare8.
     *
     * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds (register
     * arguments); r6 = remaining block count and r7 = iv, loaded from the
     * first two stack arguments; r5 = saved stack pointer. A 128-byte,
     * 16-byte aligned buffer for eight tweaks is carved out below the saved
     * registers, and sp points at it for the duration of the loop.
     *
     * Each pass: __xts_prepare8 tweaks the input, 'do8' runs the cipher, then
     * the saved tweaks are reloaded into q8-q15 and XORed into the results
     * as they are stored. When fewer than eight blocks remain, the computed
     * gotos execute only the last (blocks & 7) slots of each run (4 bytes
     * per tweak load, 8 bytes per veor/vst1 pair).
     */
    976	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
    977	push		{r4-r8, lr}
    978	mov		r5, sp			// preserve sp
    979	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
    980	rsb		r8, ip, #1
    981	sub		ip, sp, #128		// make room for 8x tweak
    982	bic		ip, ip, #0xf		// align sp to 16 bytes
    983	mov		sp, ip
    984
    98599:	bl		__xts_prepare8
    986
    987	mov		bskey, r2
    988	mov		rounds, r3
    989	bl		\do8
    990
    991	adr		ip, 0f
    992	and		lr, r6, #7
    993	cmp		r6, #8
    994	sub		ip, ip, lr, lsl #2
    995	mov		r4, sp
    996	movlt		pc, ip			// computed goto if blocks < 8
    997
    998	vld1.8		{q8}, [r4, :128]!
    999	vld1.8		{q9}, [r4, :128]!
   1000	vld1.8		{q10}, [r4, :128]!
   1001	vld1.8		{q11}, [r4, :128]!
   1002	vld1.8		{q12}, [r4, :128]!
   1003	vld1.8		{q13}, [r4, :128]!
   1004	vld1.8		{q14}, [r4, :128]!
   1005	vld1.8		{q15}, [r4, :128]
   1006
	/* lr and the flags still hold (blocks & 7) and the cmp result from above */
   10070:	adr		ip, 1f
   1008	sub		ip, ip, lr, lsl #3
   1009	movlt		pc, ip			// computed goto if blocks < 8
   1010
   1011	veor		\o0, \o0, q8
   1012	vst1.8		{\o0}, [r0]!
   1013	veor		\o1, \o1, q9
   1014	vst1.8		{\o1}, [r0]!
   1015	veor		\o2, \o2, q10
   1016	vst1.8		{\o2}, [r0]!
   1017	veor		\o3, \o3, q11
   1018	vst1.8		{\o3}, [r0]!
   1019	veor		\o4, \o4, q12
   1020	vst1.8		{\o4}, [r0]!
   1021	veor		\o5, \o5, q13
   1022	vst1.8		{\o5}, [r0]!
   1023	veor		\o6, \o6, q14
   1024	vst1.8		{\o6}, [r0]!
   1025	veor		\o7, \o7, q15
   1026	vst1.8		{\o7}, [r0]!
   1027
   10281:	subs		r6, r6, #8
   1029	bgt		99b
   1030
   1031	mov		sp, r5			// drop the tweak buffer
   1032	pop		{r4-r8, pc}
   1033	.endm
   1034
   /*
    * aesbs_xts_encrypt: __xts_crypt instantiated with aesbs_encrypt8 and the
    * register order in which that routine returns its eight blocks. ip = 0
    * makes __xts_crypt compute r8 = 1, so __xts_prepare8 never swaps the
    * last two tweaks; the reorder_last_tweak argument is not read here.
    */
   1035ENTRY(aesbs_xts_encrypt)
   1036	mov		ip, #0			// never reorder final tweak
   1037	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
   1038ENDPROC(aesbs_xts_encrypt)
   1039
   /*
    * aesbs_xts_decrypt: __xts_crypt instantiated with aesbs_decrypt8 and the
    * register order in which that routine returns its eight blocks. ip is
    * loaded from the third stack argument (reorder_last_tweak) before any
    * register is pushed; __xts_crypt turns it into r8 = 1 - ip.
    */
   1040ENTRY(aesbs_xts_decrypt)
   1041	ldr		ip, [sp, #8]		// reorder final tweak?
   1042	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
   1043ENDPROC(aesbs_xts_decrypt)