cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

aes-neonbs-core.S (22158B)


/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
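
/*
 * Note on data layout: each routine below operates on eight AES blocks at
 * a time. The 8 x 16 bytes of state are held in v0-v7 in bit sliced form,
 * i.e. bit n of every state byte lives in register vn, so that SubBytes
 * can be evaluated as a fixed sequence of boolean NEON instructions with
 * no data dependent table lookups. The round keys must be brought into the
 * same representation by aesbs_convert_key() before aesbs_encrypt8 and
 * aesbs_decrypt8 can consume them.
 */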

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text

        rounds          .req    x11
        bskey           .req    x12

        .macro          in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor             \b2, \b2, \b1
        eor             \b5, \b5, \b6
        eor             \b3, \b3, \b0
        eor             \b6, \b6, \b2
        eor             \b5, \b5, \b0
        eor             \b6, \b6, \b3
        eor             \b3, \b3, \b7
        eor             \b7, \b7, \b5
        eor             \b3, \b3, \b4
        eor             \b4, \b4, \b5
        eor             \b2, \b2, \b7
        eor             \b3, \b3, \b1
        eor             \b1, \b1, \b5
        .endm

        .macro          out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor             \b0, \b0, \b6
        eor             \b1, \b1, \b4
        eor             \b4, \b4, \b6
        eor             \b2, \b2, \b0
        eor             \b6, \b6, \b1
        eor             \b1, \b1, \b5
        eor             \b5, \b5, \b3
        eor             \b3, \b3, \b7
        eor             \b7, \b7, \b5
        eor             \b2, \b2, \b5
        eor             \b4, \b4, \b7
        .endm

        .macro          inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
        eor             \b1, \b1, \b7
        eor             \b4, \b4, \b7
        eor             \b7, \b7, \b5
        eor             \b1, \b1, \b3
        eor             \b2, \b2, \b5
        eor             \b3, \b3, \b7
        eor             \b6, \b6, \b1
        eor             \b2, \b2, \b0
        eor             \b5, \b5, \b3
        eor             \b4, \b4, \b6
        eor             \b0, \b0, \b6
        eor             \b1, \b1, \b4
        .endm

        .macro          inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
        eor             \b1, \b1, \b5
        eor             \b2, \b2, \b7
        eor             \b3, \b3, \b1
        eor             \b4, \b4, \b5
        eor             \b7, \b7, \b5
        eor             \b3, \b3, \b4
        eor             \b5, \b5, \b0
        eor             \b3, \b3, \b7
        eor             \b6, \b6, \b2
        eor             \b2, \b2, \b1
        eor             \b6, \b6, \b3
        eor             \b3, \b3, \b0
        eor             \b5, \b5, \b6
        .endm
        .macro          mul_gf4, x0, x1, y0, y1, t0, t1
        eor             \t0, \y0, \y1
        and             \t0, \t0, \x0
        eor             \x0, \x0, \x1
        and             \t1, \x1, \y0
        and             \x0, \x0, \y1
        eor             \x1, \t1, \t0
        eor             \x0, \x0, \t1
        .endm

        .macro          mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
        eor             \t0, \y0, \y1
        eor             \t1, \y2, \y3
        and             \t0, \t0, \x0
        and             \t1, \t1, \x2
        eor             \x0, \x0, \x1
        eor             \x2, \x2, \x3
        and             \x1, \x1, \y0
        and             \x3, \x3, \y2
        and             \x0, \x0, \y1
        and             \x2, \x2, \y3
        eor             \x1, \x1, \x0
        eor             \x2, \x2, \x3
        eor             \x0, \x0, \t0
        eor             \x3, \x3, \t1
        .endm

        .macro          mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
                                    y0, y1, y2, y3, t0, t1, t2, t3
        eor             \t0, \x0, \x2
        eor             \t1, \x1, \x3
        mul_gf4         \x0, \x1, \y0, \y1, \t2, \t3
        eor             \y0, \y0, \y2
        eor             \y1, \y1, \y3
        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
        eor             \x0, \x0, \t0
        eor             \x2, \x2, \t0
        eor             \x1, \x1, \t1
        eor             \x3, \x3, \t1
        eor             \t0, \x4, \x6
        eor             \t1, \x5, \x7
        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
        eor             \y0, \y0, \y2
        eor             \y1, \y1, \y3
        mul_gf4         \x4, \x5, \y0, \y1, \t2, \t3
        eor             \x4, \x4, \t0
        eor             \x6, \x6, \t0
        eor             \x5, \x5, \t1
        eor             \x7, \x7, \t1
        .endm

        .macro          inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
                                   t0, t1, t2, t3, s0, s1, s2, s3
        eor             \t3, \x4, \x6
        eor             \t0, \x5, \x7
        eor             \t1, \x1, \x3
        eor             \s1, \x7, \x6
        eor             \s0, \x0, \x2
        eor             \s3, \t3, \t0
        orr             \t2, \t0, \t1
        and             \s2, \t3, \s0
        orr             \t3, \t3, \s0
        eor             \s0, \s0, \t1
        and             \t0, \t0, \t1
        eor             \t1, \x3, \x2
        and             \s3, \s3, \s0
        and             \s1, \s1, \t1
        eor             \t1, \x4, \x5
        eor             \s0, \x1, \x0
        eor             \t3, \t3, \s1
        eor             \t2, \t2, \s1
        and             \s1, \t1, \s0
        orr             \t1, \t1, \s0
        eor             \t3, \t3, \s3
        eor             \t0, \t0, \s1
        eor             \t2, \t2, \s2
        eor             \t1, \t1, \s3
        eor             \t0, \t0, \s2
        and             \s0, \x7, \x3
        eor             \t1, \t1, \s2
        and             \s1, \x6, \x2
        and             \s2, \x5, \x1
        orr             \s3, \x4, \x0
        eor             \t3, \t3, \s0
        eor             \t1, \t1, \s2
        eor             \s0, \t0, \s3
        eor             \t2, \t2, \s1
        and             \s2, \t3, \t1
        eor             \s1, \t2, \s2
        eor             \s3, \s0, \s2
        bsl             \s1, \t1, \s0
        not             \t0, \s0
        bsl             \s0, \s1, \s3
        bsl             \t0, \s1, \s3
        bsl             \s3, \t3, \t2
        eor             \t3, \t3, \t2
        and             \s2, \s0, \s3
        eor             \t1, \t1, \t0
        eor             \s2, \s2, \t3
        mul_gf16_2      \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
        .endm

        .macro          sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                              t0, t1, t2, t3, s0, s1, s2, s3
        in_bs_ch        \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
                        \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        out_bs_ch       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
        .endm

        .macro          inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                                  t0, t1, t2, t3, s0, s1, s2, s3
        inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
                        \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
        .endm

        .macro          enc_next_rk
        ldp             q16, q17, [bskey], #128
        ldp             q18, q19, [bskey, #-96]
        ldp             q20, q21, [bskey, #-64]
        ldp             q22, q23, [bskey, #-32]
        .endm

        .macro          dec_next_rk
        ldp             q16, q17, [bskey, #-128]!
        ldp             q18, q19, [bskey, #32]
        ldp             q20, q21, [bskey, #64]
        ldp             q22, q23, [bskey, #96]
        .endm

        .macro          add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
        eor             \x0\().16b, \x0\().16b, v16.16b
        eor             \x1\().16b, \x1\().16b, v17.16b
        eor             \x2\().16b, \x2\().16b, v18.16b
        eor             \x3\().16b, \x3\().16b, v19.16b
        eor             \x4\().16b, \x4\().16b, v20.16b
        eor             \x5\().16b, \x5\().16b, v21.16b
        eor             \x6\().16b, \x6\().16b, v22.16b
        eor             \x7\().16b, \x7\().16b, v23.16b
        .endm

        .macro          shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
        tbl             \x0\().16b, {\x0\().16b}, \mask\().16b
        tbl             \x1\().16b, {\x1\().16b}, \mask\().16b
        tbl             \x2\().16b, {\x2\().16b}, \mask\().16b
        tbl             \x3\().16b, {\x3\().16b}, \mask\().16b
        tbl             \x4\().16b, {\x4\().16b}, \mask\().16b
        tbl             \x5\().16b, {\x5\().16b}, \mask\().16b
        tbl             \x6\().16b, {\x6\().16b}, \mask\().16b
        tbl             \x7\().16b, {\x7\().16b}, \mask\().16b
        .endm

        .macro          mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                                  t0, t1, t2, t3, t4, t5, t6, t7, inv
        ext             \t0\().16b, \x0\().16b, \x0\().16b, #12
        ext             \t1\().16b, \x1\().16b, \x1\().16b, #12
        eor             \x0\().16b, \x0\().16b, \t0\().16b
        ext             \t2\().16b, \x2\().16b, \x2\().16b, #12
        eor             \x1\().16b, \x1\().16b, \t1\().16b
        ext             \t3\().16b, \x3\().16b, \x3\().16b, #12
        eor             \x2\().16b, \x2\().16b, \t2\().16b
        ext             \t4\().16b, \x4\().16b, \x4\().16b, #12
        eor             \x3\().16b, \x3\().16b, \t3\().16b
        ext             \t5\().16b, \x5\().16b, \x5\().16b, #12
        eor             \x4\().16b, \x4\().16b, \t4\().16b
        ext             \t6\().16b, \x6\().16b, \x6\().16b, #12
        eor             \x5\().16b, \x5\().16b, \t5\().16b
        ext             \t7\().16b, \x7\().16b, \x7\().16b, #12
        eor             \x6\().16b, \x6\().16b, \t6\().16b
        eor             \t1\().16b, \t1\().16b, \x0\().16b
        eor             \x7\().16b, \x7\().16b, \t7\().16b
        ext             \x0\().16b, \x0\().16b, \x0\().16b, #8
        eor             \t2\().16b, \t2\().16b, \x1\().16b
        eor             \t0\().16b, \t0\().16b, \x7\().16b
        eor             \t1\().16b, \t1\().16b, \x7\().16b
        ext             \x1\().16b, \x1\().16b, \x1\().16b, #8
        eor             \t5\().16b, \t5\().16b, \x4\().16b
        eor             \x0\().16b, \x0\().16b, \t0\().16b
        eor             \t6\().16b, \t6\().16b, \x5\().16b
        eor             \x1\().16b, \x1\().16b, \t1\().16b
        ext             \t0\().16b, \x4\().16b, \x4\().16b, #8
        eor             \t4\().16b, \t4\().16b, \x3\().16b
        ext             \t1\().16b, \x5\().16b, \x5\().16b, #8
        eor             \t7\().16b, \t7\().16b, \x6\().16b
        ext             \x4\().16b, \x3\().16b, \x3\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x2\().16b
        ext             \x5\().16b, \x7\().16b, \x7\().16b, #8
        eor             \t4\().16b, \t4\().16b, \x7\().16b
        ext             \x3\().16b, \x6\().16b, \x6\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x7\().16b
        ext             \x6\().16b, \x2\().16b, \x2\().16b, #8
        eor             \x7\().16b, \t1\().16b, \t5\().16b
        .ifb            \inv
        eor             \x2\().16b, \t0\().16b, \t4\().16b
        eor             \x4\().16b, \x4\().16b, \t3\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x3\().16b, \x3\().16b, \t6\().16b
        eor             \x6\().16b, \x6\().16b, \t2\().16b
        .else
        eor             \t3\().16b, \t3\().16b, \x4\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x2\().16b, \x3\().16b, \t6\().16b
        eor             \x3\().16b, \t0\().16b, \t4\().16b
        eor             \x4\().16b, \x6\().16b, \t2\().16b
        mov             \x6\().16b, \t3\().16b
        .endif
        .endm

        .macro          inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                                      t0, t1, t2, t3, t4, t5, t6, t7
        ext             \t0\().16b, \x0\().16b, \x0\().16b, #8
        ext             \t6\().16b, \x6\().16b, \x6\().16b, #8
        ext             \t7\().16b, \x7\().16b, \x7\().16b, #8
        eor             \t0\().16b, \t0\().16b, \x0\().16b
        ext             \t1\().16b, \x1\().16b, \x1\().16b, #8
        eor             \t6\().16b, \t6\().16b, \x6\().16b
        ext             \t2\().16b, \x2\().16b, \x2\().16b, #8
        eor             \t7\().16b, \t7\().16b, \x7\().16b
        ext             \t3\().16b, \x3\().16b, \x3\().16b, #8
        eor             \t1\().16b, \t1\().16b, \x1\().16b
        ext             \t4\().16b, \x4\().16b, \x4\().16b, #8
        eor             \t2\().16b, \t2\().16b, \x2\().16b
        ext             \t5\().16b, \x5\().16b, \x5\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x3\().16b
        eor             \t4\().16b, \t4\().16b, \x4\().16b
        eor             \t5\().16b, \t5\().16b, \x5\().16b
        eor             \x0\().16b, \x0\().16b, \t6\().16b
        eor             \x1\().16b, \x1\().16b, \t6\().16b
        eor             \x2\().16b, \x2\().16b, \t0\().16b
        eor             \x4\().16b, \x4\().16b, \t2\().16b
        eor             \x3\().16b, \x3\().16b, \t1\().16b
        eor             \x1\().16b, \x1\().16b, \t7\().16b
        eor             \x2\().16b, \x2\().16b, \t7\().16b
        eor             \x4\().16b, \x4\().16b, \t6\().16b
        eor             \x5\().16b, \x5\().16b, \t3\().16b
        eor             \x3\().16b, \x3\().16b, \t6\().16b
        eor             \x6\().16b, \x6\().16b, \t4\().16b
        eor             \x4\().16b, \x4\().16b, \t7\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x7\().16b, \x7\().16b, \t5\().16b
        mix_cols        \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
        .endm

        .macro          swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
        ushr            \t0\().2d, \b0\().2d, #\n
        ushr            \t1\().2d, \b1\().2d, #\n
        eor             \t0\().16b, \t0\().16b, \a0\().16b
        eor             \t1\().16b, \t1\().16b, \a1\().16b
        and             \t0\().16b, \t0\().16b, \mask\().16b
        and             \t1\().16b, \t1\().16b, \mask\().16b
        eor             \a0\().16b, \a0\().16b, \t0\().16b
        shl             \t0\().2d, \t0\().2d, #\n
        eor             \a1\().16b, \a1\().16b, \t1\().16b
        shl             \t1\().2d, \t1\().2d, #\n
        eor             \b0\().16b, \b0\().16b, \t0\().16b
        eor             \b1\().16b, \b1\().16b, \t1\().16b
        .endm

        .macro          bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
        movi            \t0\().16b, #0x55
        movi            \t1\().16b, #0x33
        swapmove_2x     \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
        swapmove_2x     \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
        movi            \t0\().16b, #0x0f
        swapmove_2x     \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
        swapmove_2x     \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
        swapmove_2x     \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
        swapmove_2x     \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
        .endm

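        /*
         * Permutation vectors for the tbl instruction. M0 is the byte
         * interleaving applied before bit slicing (used on the round keys
         * in aesbs_convert_key and, merged with ShiftRows/InvShiftRows as
         * M0SR/M0ISR, on the input blocks). SR and ISR perform ShiftRows
         * and InvShiftRows on the sliced state, while SRM0 and ISRM0 fold
         * the final ShiftRows/InvShiftRows into the reordering that is
         * undone when the state is un-bitsliced at the end.
         */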
        .align          6
M0:     .octa           0x0004080c0105090d02060a0e03070b0f

M0SR:   .octa           0x0004080c05090d010a0e02060f03070b
SR:     .octa           0x0f0e0d0c0a09080b0504070600030201
SRM0:   .octa           0x01060b0c0207080d0304090e00050a0f

M0ISR:  .octa           0x0004080c0d0105090a0e0206070b0f03
ISR:    .octa           0x0f0e0d0c080b0a090504070602010003
ISRM0:  .octa           0x0306090c00070a0d01040b0e0205080f

        /*
         * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
         */
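        /*
         * The output buffer receives the round 0 key unmodified (16 bytes),
         * then rounds - 1 bit sliced round keys of 128 bytes each, and
         * finally the last round key with 0x63 folded in (16 bytes), so the
         * caller must provide 16 + (rounds - 1) * 128 + 16 bytes of space.
         *
         * Illustrative call sequence (a sketch, not part of this file): the
         * key is first expanded with the kernel's generic C key schedule,
         * then converted. Buffer and variable names other than
         * aesbs_convert_key() are the caller's own.
         *
         *      struct crypto_aes_ctx gen;
         *      u8 bskey[16 + 13 * 128 + 16];   // enough for AES-256 (14 rounds)
         *      int rounds = 6 + key_len / 4;
         *
         *      aes_expandkey(&gen, user_key, key_len);
         *      aesbs_convert_key(bskey, gen.key_enc, rounds);
         */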
SYM_FUNC_START(aesbs_convert_key)
        ld1             {v7.4s}, [x1], #16              // load round 0 key
        ld1             {v17.4s}, [x1], #16             // load round 1 key

        movi            v8.16b,  #0x01                  // bit masks
        movi            v9.16b,  #0x02
        movi            v10.16b, #0x04
        movi            v11.16b, #0x08
        movi            v12.16b, #0x10
        movi            v13.16b, #0x20
        movi            v14.16b, #0x40
        movi            v15.16b, #0x80
        ldr             q16, M0

        sub             x2, x2, #1
        str             q7, [x0], #16                   // save round 0 key

.Lkey_loop:
        tbl             v7.16b, {v17.16b}, v16.16b
        ld1             {v17.4s}, [x1], #16             // load next round key

        cmtst           v0.16b, v7.16b, v8.16b
        cmtst           v1.16b, v7.16b, v9.16b
        cmtst           v2.16b, v7.16b, v10.16b
        cmtst           v3.16b, v7.16b, v11.16b
        cmtst           v4.16b, v7.16b, v12.16b
        cmtst           v5.16b, v7.16b, v13.16b
        cmtst           v6.16b, v7.16b, v14.16b
        cmtst           v7.16b, v7.16b, v15.16b
        not             v0.16b, v0.16b
        not             v1.16b, v1.16b
        not             v5.16b, v5.16b
        not             v6.16b, v6.16b

        subs            x2, x2, #1
        stp             q0, q1, [x0], #128
        stp             q2, q3, [x0, #-96]
        stp             q4, q5, [x0, #-64]
        stp             q6, q7, [x0, #-32]
        b.ne            .Lkey_loop

        movi            v7.16b, #0x63                   // compose .L63
        eor             v17.16b, v17.16b, v7.16b
        str             q17, [x0]
        ret
SYM_FUNC_END(aesbs_convert_key)

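        /*
         * Internal 8-way primitives. The eight input blocks are expected in
         * v0-v7, the converted key schedule in bskey (x12) and the round
         * count in rounds (x11); both register aliases are clobbered. On
         * return the result is left in a permuted register order
         * (v0, v1, v4, v6, v3, v7, v2, v5 after aesbs_encrypt8, and
         * v0, v1, v6, v4, v2, v7, v3, v5 after aesbs_decrypt8), which the
         * callers below account for when storing or xor'ing the output.
         */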
        .align          4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
        ldr             q9, [bskey], #16                // round 0 key
        ldr             q8, M0SR
        ldr             q24, SR

        eor             v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor             v11.16b, v1.16b, v9.16b
        tbl             v0.16b, {v10.16b}, v8.16b
        eor             v12.16b, v2.16b, v9.16b
        tbl             v1.16b, {v11.16b}, v8.16b
        eor             v13.16b, v3.16b, v9.16b
        tbl             v2.16b, {v12.16b}, v8.16b
        eor             v14.16b, v4.16b, v9.16b
        tbl             v3.16b, {v13.16b}, v8.16b
        eor             v15.16b, v5.16b, v9.16b
        tbl             v4.16b, {v14.16b}, v8.16b
        eor             v10.16b, v6.16b, v9.16b
        tbl             v5.16b, {v15.16b}, v8.16b
        eor             v11.16b, v7.16b, v9.16b
        tbl             v6.16b, {v10.16b}, v8.16b
        tbl             v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub             rounds, rounds, #1
        b               .Lenc_sbox

.Lenc_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
        sbox            v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                                                        v13, v14, v15
        subs            rounds, rounds, #1
        b.cc            .Lenc_done

        enc_next_rk

        mix_cols        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
                                                        v13, v14, v15

        add_round_key   v0, v1, v2, v3, v4, v5, v6, v7

        b.ne            .Lenc_loop
        ldr             q24, SRM0
        b               .Lenc_loop

.Lenc_done:
        ldr             q12, [bskey]                    // last round key

        bitslice        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

        eor             v0.16b, v0.16b, v12.16b
        eor             v1.16b, v1.16b, v12.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v6.16b, v6.16b, v12.16b
        eor             v3.16b, v3.16b, v12.16b
        eor             v7.16b, v7.16b, v12.16b
        eor             v2.16b, v2.16b, v12.16b
        eor             v5.16b, v5.16b, v12.16b
        ret
SYM_FUNC_END(aesbs_encrypt8)

        .align          4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
        lsl             x9, rounds, #7
        add             bskey, bskey, x9

        ldr             q9, [bskey, #-112]!             // round 0 key
        ldr             q8, M0ISR
        ldr             q24, ISR

        eor             v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor             v11.16b, v1.16b, v9.16b
        tbl             v0.16b, {v10.16b}, v8.16b
        eor             v12.16b, v2.16b, v9.16b
        tbl             v1.16b, {v11.16b}, v8.16b
        eor             v13.16b, v3.16b, v9.16b
        tbl             v2.16b, {v12.16b}, v8.16b
        eor             v14.16b, v4.16b, v9.16b
        tbl             v3.16b, {v13.16b}, v8.16b
        eor             v15.16b, v5.16b, v9.16b
        tbl             v4.16b, {v14.16b}, v8.16b
        eor             v10.16b, v6.16b, v9.16b
        tbl             v5.16b, {v15.16b}, v8.16b
        eor             v11.16b, v7.16b, v9.16b
        tbl             v6.16b, {v10.16b}, v8.16b
        tbl             v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub             rounds, rounds, #1
        b               .Ldec_sbox

.Ldec_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
        inv_sbox        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                                                        v13, v14, v15
        subs            rounds, rounds, #1
        b.cc            .Ldec_done

        dec_next_rk

        add_round_key   v0, v1, v6, v4, v2, v7, v3, v5

        inv_mix_cols    v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
                                                        v13, v14, v15

        b.ne            .Ldec_loop
        ldr             q24, ISRM0
        b               .Ldec_loop
.Ldec_done:
        ldr             q12, [bskey, #-16]              // last round key

        bitslice        v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

        eor             v0.16b, v0.16b, v12.16b
        eor             v1.16b, v1.16b, v12.16b
        eor             v6.16b, v6.16b, v12.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v2.16b, v2.16b, v12.16b
        eor             v7.16b, v7.16b, v12.16b
        eor             v3.16b, v3.16b, v12.16b
        eor             v5.16b, v5.16b, v12.16b
        ret
SYM_FUNC_END(aesbs_decrypt8)

        /*
         * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         */
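        /*
         * rk must point at a key schedule produced by aesbs_convert_key(),
         * and blocks is the number of 16 byte blocks to process. The macro
         * below handles a trailing partial batch by setting x5 = 1 << blocks
         * when fewer than eight blocks remain, so the tbnz tests cut the
         * load/store sequences short after the right number of blocks.
         *
         * Sketch of a call from C glue code (buffer names are illustrative):
         *
         *      aesbs_ecb_encrypt(dst, src, bskey, rounds, nblocks);
         */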
        .macro          __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        frame_push      5

        mov             x19, x0
        mov             x20, x1
        mov             x21, x2
        mov             x22, x3
        mov             x23, x4

99:     mov             x5, #1
        lsl             x5, x5, x23
        subs            w23, w23, #8
        csel            x23, x23, xzr, pl
        csel            x5, x5, xzr, mi

        ld1             {v0.16b}, [x20], #16
        tbnz            x5, #1, 0f
        ld1             {v1.16b}, [x20], #16
        tbnz            x5, #2, 0f
        ld1             {v2.16b}, [x20], #16
        tbnz            x5, #3, 0f
        ld1             {v3.16b}, [x20], #16
        tbnz            x5, #4, 0f
        ld1             {v4.16b}, [x20], #16
        tbnz            x5, #5, 0f
        ld1             {v5.16b}, [x20], #16
        tbnz            x5, #6, 0f
        ld1             {v6.16b}, [x20], #16
        tbnz            x5, #7, 0f
        ld1             {v7.16b}, [x20], #16

0:      mov             bskey, x21
        mov             rounds, x22
        bl              \do8

        st1             {\o0\().16b}, [x19], #16
        tbnz            x5, #1, 1f
        st1             {\o1\().16b}, [x19], #16
        tbnz            x5, #2, 1f
        st1             {\o2\().16b}, [x19], #16
        tbnz            x5, #3, 1f
        st1             {\o3\().16b}, [x19], #16
        tbnz            x5, #4, 1f
        st1             {\o4\().16b}, [x19], #16
        tbnz            x5, #5, 1f
        st1             {\o5\().16b}, [x19], #16
        tbnz            x5, #6, 1f
        st1             {\o6\().16b}, [x19], #16
        tbnz            x5, #7, 1f
        st1             {\o7\().16b}, [x19], #16

        cbz             x23, 1f
        b               99b

1:      frame_pop
        ret
        .endm

        .align          4
SYM_FUNC_START(aesbs_ecb_encrypt)
        __ecb_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

        .align          4
SYM_FUNC_START(aesbs_ecb_decrypt)
        __ecb_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)

        /*
         * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
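        /*
         * Only CBC decryption is implemented here: CBC encryption chains
         * each block into the next and therefore cannot use the 8-way
         * parallelism. For decryption, copies of the ciphertext blocks are
         * kept in v25-v31 so that each decrypted block can be xor'ed with
         * the preceding ciphertext (or with the IV for the first block),
         * and the last ciphertext block is written back to iv[] so the next
         * call can continue the chain.
         */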
        .align          4
SYM_FUNC_START(aesbs_cbc_decrypt)
        frame_push      6

        mov             x19, x0
        mov             x20, x1
        mov             x21, x2
        mov             x22, x3
        mov             x23, x4
        mov             x24, x5

99:     mov             x6, #1
        lsl             x6, x6, x23
        subs            w23, w23, #8
        csel            x23, x23, xzr, pl
        csel            x6, x6, xzr, mi

        ld1             {v0.16b}, [x20], #16
        mov             v25.16b, v0.16b
        tbnz            x6, #1, 0f
        ld1             {v1.16b}, [x20], #16
        mov             v26.16b, v1.16b
        tbnz            x6, #2, 0f
        ld1             {v2.16b}, [x20], #16
        mov             v27.16b, v2.16b
        tbnz            x6, #3, 0f
        ld1             {v3.16b}, [x20], #16
        mov             v28.16b, v3.16b
        tbnz            x6, #4, 0f
        ld1             {v4.16b}, [x20], #16
        mov             v29.16b, v4.16b
        tbnz            x6, #5, 0f
        ld1             {v5.16b}, [x20], #16
        mov             v30.16b, v5.16b
        tbnz            x6, #6, 0f
        ld1             {v6.16b}, [x20], #16
        mov             v31.16b, v6.16b
        tbnz            x6, #7, 0f
        ld1             {v7.16b}, [x20]

0:      mov             bskey, x21
        mov             rounds, x22
        bl              aesbs_decrypt8

        ld1             {v24.16b}, [x24]                // load IV

        eor             v1.16b, v1.16b, v25.16b
        eor             v6.16b, v6.16b, v26.16b
        eor             v4.16b, v4.16b, v27.16b
        eor             v2.16b, v2.16b, v28.16b
        eor             v7.16b, v7.16b, v29.16b
        eor             v0.16b, v0.16b, v24.16b
        eor             v3.16b, v3.16b, v30.16b
        eor             v5.16b, v5.16b, v31.16b

        st1             {v0.16b}, [x19], #16
        mov             v24.16b, v25.16b
        tbnz            x6, #1, 1f
        st1             {v1.16b}, [x19], #16
        mov             v24.16b, v26.16b
        tbnz            x6, #2, 1f
        st1             {v6.16b}, [x19], #16
        mov             v24.16b, v27.16b
        tbnz            x6, #3, 1f
        st1             {v4.16b}, [x19], #16
        mov             v24.16b, v28.16b
        tbnz            x6, #4, 1f
        st1             {v2.16b}, [x19], #16
        mov             v24.16b, v29.16b
        tbnz            x6, #5, 1f
        st1             {v7.16b}, [x19], #16
        mov             v24.16b, v30.16b
        tbnz            x6, #6, 1f
        st1             {v3.16b}, [x19], #16
        mov             v24.16b, v31.16b
        tbnz            x6, #7, 1f
        ld1             {v24.16b}, [x20], #16
        st1             {v5.16b}, [x19], #16
1:      st1             {v24.16b}, [x24]                // store IV

        cbz             x23, 2f
        b               99b

2:      frame_pop
        ret
SYM_FUNC_END(aesbs_cbc_decrypt)

        .macro          next_tweak, out, in, const, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, \const\().16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

        /*
         * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
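        /*
         * The first tweak is read from iv[]; each subsequent tweak is
         * derived by doubling the previous one in GF(2^128) via next_tweak
         * above, with 0x87 as the reduction constant for the polynomial
         * x^128 + x^7 + x^2 + x + 1. __xts_crypt8 derives the tweaks for
         * all eight blocks (plus the starting tweak of the next batch),
         * xors them into the inputs and tail-calls the 8-way cipher through
         * x16; the __xts_crypt macro below then xors the tweaks in again
         * and stores the result.
         */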
SYM_FUNC_START_LOCAL(__xts_crypt8)
        movi            v18.2s, #0x1
        movi            v19.2s, #0x87
        uzp1            v18.4s, v18.4s, v19.4s

        ld1             {v0.16b-v3.16b}, [x1], #64
        ld1             {v4.16b-v7.16b}, [x1], #64

        next_tweak      v26, v25, v18, v19
        next_tweak      v27, v26, v18, v19
        next_tweak      v28, v27, v18, v19
        next_tweak      v29, v28, v18, v19
        next_tweak      v30, v29, v18, v19
        next_tweak      v31, v30, v18, v19
        next_tweak      v16, v31, v18, v19
        next_tweak      v17, v16, v18, v19

        eor             v0.16b, v0.16b, v25.16b
        eor             v1.16b, v1.16b, v26.16b
        eor             v2.16b, v2.16b, v27.16b
        eor             v3.16b, v3.16b, v28.16b
        eor             v4.16b, v4.16b, v29.16b
        eor             v5.16b, v5.16b, v30.16b
        eor             v6.16b, v6.16b, v31.16b
        eor             v7.16b, v7.16b, v16.16b

        stp             q16, q17, [sp, #16]

        mov             bskey, x2
        mov             rounds, x3
        br              x16
SYM_FUNC_END(__xts_crypt8)

        .macro          __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        stp             x29, x30, [sp, #-48]!
        mov             x29, sp

        ld1             {v25.16b}, [x5]

0:      adr             x16, \do8
        bl              __xts_crypt8

        eor             v16.16b, \o0\().16b, v25.16b
        eor             v17.16b, \o1\().16b, v26.16b
        eor             v18.16b, \o2\().16b, v27.16b
        eor             v19.16b, \o3\().16b, v28.16b

        ldp             q24, q25, [sp, #16]

        eor             v20.16b, \o4\().16b, v29.16b
        eor             v21.16b, \o5\().16b, v30.16b
        eor             v22.16b, \o6\().16b, v31.16b
        eor             v23.16b, \o7\().16b, v24.16b

        st1             {v16.16b-v19.16b}, [x0], #64
        st1             {v20.16b-v23.16b}, [x0], #64

        subs            x4, x4, #8
        b.gt            0b

        st1             {v25.16b}, [x5]
        ldp             x29, x30, [sp], #48
        ret
        .endm

SYM_FUNC_START(aesbs_xts_encrypt)
        __xts_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_FUNC_START(aesbs_xts_decrypt)
        __xts_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

        .macro          next_ctr, v
        mov             \v\().d[1], x8
        adds            x8, x8, #1
        mov             \v\().d[0], x7
        adc             x7, x7, xzr
        rev64           \v\().16b, \v\().16b
        .endm

        /*
         * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
         *                   int rounds, int blocks, u8 iv[])
         */
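        /*
         * The counter block iv[] is treated as a single 128-bit big endian
         * integer, kept byte swapped in x7:x8 (x8 holds the low 64 bits)
         * and incremented with adds/adc by next_ctr for each block. The
         * counter following the last processed block is stored back to
         * iv[] so the keystream can be continued by a subsequent call.
         * Roughly equivalent C for one increment (a sketch, using the
         * kernel's get_unaligned_be64() helper):
         *
         *      u64 hi = get_unaligned_be64(iv);
         *      u64 lo = get_unaligned_be64(iv + 8);
         *
         *      if (++lo == 0)          // 128-bit increment
         *              hi++;
         */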
SYM_FUNC_START(aesbs_ctr_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ldp             x7, x8, [x5]
        ld1             {v0.16b}, [x5]
CPU_LE( rev             x7, x7          )
CPU_LE( rev             x8, x8          )
        adds            x8, x8, #1
        adc             x7, x7, xzr

0:      next_ctr        v1
        next_ctr        v2
        next_ctr        v3
        next_ctr        v4
        next_ctr        v5
        next_ctr        v6
        next_ctr        v7

        mov             bskey, x2
        mov             rounds, x3
        bl              aesbs_encrypt8

        ld1             { v8.16b-v11.16b}, [x1], #64
        ld1             {v12.16b-v15.16b}, [x1], #64

        eor             v8.16b, v0.16b, v8.16b
        eor             v9.16b, v1.16b, v9.16b
        eor             v10.16b, v4.16b, v10.16b
        eor             v11.16b, v6.16b, v11.16b
        eor             v12.16b, v3.16b, v12.16b
        eor             v13.16b, v7.16b, v13.16b
        eor             v14.16b, v2.16b, v14.16b
        eor             v15.16b, v5.16b, v15.16b

        st1             { v8.16b-v11.16b}, [x0], #64
        st1             {v12.16b-v15.16b}, [x0], #64

        next_ctr        v0
        subs            x4, x4, #8
        b.gt            0b

        st1             {v0.16b}, [x5]
        ldp             x29, x30, [sp], #16
        ret
SYM_FUNC_END(aesbs_ctr_encrypt)