aes-neonbs-core.S (22158B)
1/* SPDX-License-Identifier: GPL-2.0-only */ 2/* 3 * Bit sliced AES using NEON instructions 4 * 5 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> 6 */ 7 8/* 9 * The algorithm implemented here is described in detail by the paper 10 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and 11 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf) 12 * 13 * This implementation is based primarily on the OpenSSL implementation 14 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org> 15 */ 16 17#include <linux/linkage.h> 18#include <asm/assembler.h> 19 20 .text 21 22 rounds .req x11 23 bskey .req x12 24 25 .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 26 eor \b2, \b2, \b1 27 eor \b5, \b5, \b6 28 eor \b3, \b3, \b0 29 eor \b6, \b6, \b2 30 eor \b5, \b5, \b0 31 eor \b6, \b6, \b3 32 eor \b3, \b3, \b7 33 eor \b7, \b7, \b5 34 eor \b3, \b3, \b4 35 eor \b4, \b4, \b5 36 eor \b2, \b2, \b7 37 eor \b3, \b3, \b1 38 eor \b1, \b1, \b5 39 .endm 40 41 .macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 42 eor \b0, \b0, \b6 43 eor \b1, \b1, \b4 44 eor \b4, \b4, \b6 45 eor \b2, \b2, \b0 46 eor \b6, \b6, \b1 47 eor \b1, \b1, \b5 48 eor \b5, \b5, \b3 49 eor \b3, \b3, \b7 50 eor \b7, \b7, \b5 51 eor \b2, \b2, \b5 52 eor \b4, \b4, \b7 53 .endm 54 55 .macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5 56 eor \b1, \b1, \b7 57 eor \b4, \b4, \b7 58 eor \b7, \b7, \b5 59 eor \b1, \b1, \b3 60 eor \b2, \b2, \b5 61 eor \b3, \b3, \b7 62 eor \b6, \b6, \b1 63 eor \b2, \b2, \b0 64 eor \b5, \b5, \b3 65 eor \b4, \b4, \b6 66 eor \b0, \b0, \b6 67 eor \b1, \b1, \b4 68 .endm 69 70 .macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2 71 eor \b1, \b1, \b5 72 eor \b2, \b2, \b7 73 eor \b3, \b3, \b1 74 eor \b4, \b4, \b5 75 eor \b7, \b7, \b5 76 eor \b3, \b3, \b4 77 eor \b5, \b5, \b0 78 eor \b3, \b3, \b7 79 eor \b6, \b6, \b2 80 eor \b2, \b2, \b1 81 eor \b6, \b6, \b3 82 eor \b3, \b3, \b0 83 eor \b5, \b5, \b6 84 .endm 85 86 .macro mul_gf4, x0, x1, y0, y1, t0, t1 87 eor \t0, \y0, \y1 88 and \t0, \t0, \x0 89 eor \x0, \x0, \x1 90 and \t1, \x1, \y0 91 and \x0, \x0, \y1 92 eor \x1, \t1, \t0 93 eor \x0, \x0, \t1 94 .endm 95 96 .macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1 97 eor \t0, \y0, \y1 98 eor \t1, \y2, \y3 99 and \t0, \t0, \x0 100 and \t1, \t1, \x2 101 eor \x0, \x0, \x1 102 eor \x2, \x2, \x3 103 and \x1, \x1, \y0 104 and \x3, \x3, \y2 105 and \x0, \x0, \y1 106 and \x2, \x2, \y3 107 eor \x1, \x1, \x0 108 eor \x2, \x2, \x3 109 eor \x0, \x0, \t0 110 eor \x3, \x3, \t1 111 .endm 112 113 .macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \ 114 y0, y1, y2, y3, t0, t1, t2, t3 115 eor \t0, \x0, \x2 116 eor \t1, \x1, \x3 117 mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3 118 eor \y0, \y0, \y2 119 eor \y1, \y1, \y3 120 mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2 121 eor \x0, \x0, \t0 122 eor \x2, \x2, \t0 123 eor \x1, \x1, \t1 124 eor \x3, \x3, \t1 125 eor \t0, \x4, \x6 126 eor \t1, \x5, \x7 127 mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2 128 eor \y0, \y0, \y2 129 eor \y1, \y1, \y3 130 mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3 131 eor \x4, \x4, \t0 132 eor \x6, \x6, \t0 133 eor \x5, \x5, \t1 134 eor \x7, \x7, \t1 135 .endm 136 137 .macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \ 138 t0, t1, t2, t3, s0, s1, s2, s3 139 eor \t3, \x4, \x6 140 eor \t0, \x5, \x7 141 eor \t1, \x1, \x3 142 eor \s1, \x7, \x6 143 eor \s0, \x0, \x2 144 eor \s3, \t3, \t0 145 orr \t2, \t0, \t1 146 and \s2, \t3, \s0 147 orr \t3, \t3, \s0 148 eor \s0, \s0, \t1 149 and \t0, \t0, \t1 150 eor \t1, \x3, \x2 151 and \s3, \s3, \s0 152 and \s1, \s1, \t1 153 eor \t1, \x4, \x5 154 eor \s0, \x1, \x0 155 eor \t3, \t3, \s1 156 eor \t2, \t2, \s1 157 and \s1, \t1, \s0 158 orr \t1, \t1, \s0 159 eor \t3, \t3, \s3 160 eor \t0, \t0, \s1 161 eor \t2, \t2, \s2 162 eor \t1, \t1, \s3 163 eor \t0, \t0, \s2 164 and \s0, \x7, \x3 165 eor \t1, \t1, \s2 166 and \s1, \x6, \x2 167 and \s2, \x5, \x1 168 orr \s3, \x4, \x0 169 eor \t3, \t3, \s0 170 eor \t1, \t1, \s2 171 eor \s0, \t0, \s3 172 eor \t2, \t2, \s1 173 and \s2, \t3, \t1 174 eor \s1, \t2, \s2 175 eor \s3, \s0, \s2 176 bsl \s1, \t1, \s0 177 not \t0, \s0 178 bsl \s0, \s1, \s3 179 bsl \t0, \s1, \s3 180 bsl \s3, \t3, \t2 181 eor \t3, \t3, \t2 182 and \s2, \s0, \s3 183 eor \t1, \t1, \t0 184 eor \s2, \s2, \t3 185 mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ 186 \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 187 .endm 188 189 .macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ 190 t0, t1, t2, t3, s0, s1, s2, s3 191 in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ 192 \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b 193 inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \ 194 \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ 195 \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ 196 \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b 197 out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ 198 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b 199 .endm 200 201 .macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ 202 t0, t1, t2, t3, s0, s1, s2, s3 203 inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ 204 \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b 205 inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \ 206 \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ 207 \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ 208 \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b 209 inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ 210 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b 211 .endm 212 213 .macro enc_next_rk 214 ldp q16, q17, [bskey], #128 215 ldp q18, q19, [bskey, #-96] 216 ldp q20, q21, [bskey, #-64] 217 ldp q22, q23, [bskey, #-32] 218 .endm 219 220 .macro dec_next_rk 221 ldp q16, q17, [bskey, #-128]! 222 ldp q18, q19, [bskey, #32] 223 ldp q20, q21, [bskey, #64] 224 ldp q22, q23, [bskey, #96] 225 .endm 226 227 .macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7 228 eor \x0\().16b, \x0\().16b, v16.16b 229 eor \x1\().16b, \x1\().16b, v17.16b 230 eor \x2\().16b, \x2\().16b, v18.16b 231 eor \x3\().16b, \x3\().16b, v19.16b 232 eor \x4\().16b, \x4\().16b, v20.16b 233 eor \x5\().16b, \x5\().16b, v21.16b 234 eor \x6\().16b, \x6\().16b, v22.16b 235 eor \x7\().16b, \x7\().16b, v23.16b 236 .endm 237 238 .macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask 239 tbl \x0\().16b, {\x0\().16b}, \mask\().16b 240 tbl \x1\().16b, {\x1\().16b}, \mask\().16b 241 tbl \x2\().16b, {\x2\().16b}, \mask\().16b 242 tbl \x3\().16b, {\x3\().16b}, \mask\().16b 243 tbl \x4\().16b, {\x4\().16b}, \mask\().16b 244 tbl \x5\().16b, {\x5\().16b}, \mask\().16b 245 tbl \x6\().16b, {\x6\().16b}, \mask\().16b 246 tbl \x7\().16b, {\x7\().16b}, \mask\().16b 247 .endm 248 249 .macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ 250 t0, t1, t2, t3, t4, t5, t6, t7, inv 251 ext \t0\().16b, \x0\().16b, \x0\().16b, #12 252 ext \t1\().16b, \x1\().16b, \x1\().16b, #12 253 eor \x0\().16b, \x0\().16b, \t0\().16b 254 ext \t2\().16b, \x2\().16b, \x2\().16b, #12 255 eor \x1\().16b, \x1\().16b, \t1\().16b 256 ext \t3\().16b, \x3\().16b, \x3\().16b, #12 257 eor \x2\().16b, \x2\().16b, \t2\().16b 258 ext \t4\().16b, \x4\().16b, \x4\().16b, #12 259 eor \x3\().16b, \x3\().16b, \t3\().16b 260 ext \t5\().16b, \x5\().16b, \x5\().16b, #12 261 eor \x4\().16b, \x4\().16b, \t4\().16b 262 ext \t6\().16b, \x6\().16b, \x6\().16b, #12 263 eor \x5\().16b, \x5\().16b, \t5\().16b 264 ext \t7\().16b, \x7\().16b, \x7\().16b, #12 265 eor \x6\().16b, \x6\().16b, \t6\().16b 266 eor \t1\().16b, \t1\().16b, \x0\().16b 267 eor \x7\().16b, \x7\().16b, \t7\().16b 268 ext \x0\().16b, \x0\().16b, \x0\().16b, #8 269 eor \t2\().16b, \t2\().16b, \x1\().16b 270 eor \t0\().16b, \t0\().16b, \x7\().16b 271 eor \t1\().16b, \t1\().16b, \x7\().16b 272 ext \x1\().16b, \x1\().16b, \x1\().16b, #8 273 eor \t5\().16b, \t5\().16b, \x4\().16b 274 eor \x0\().16b, \x0\().16b, \t0\().16b 275 eor \t6\().16b, \t6\().16b, \x5\().16b 276 eor \x1\().16b, \x1\().16b, \t1\().16b 277 ext \t0\().16b, \x4\().16b, \x4\().16b, #8 278 eor \t4\().16b, \t4\().16b, \x3\().16b 279 ext \t1\().16b, \x5\().16b, \x5\().16b, #8 280 eor \t7\().16b, \t7\().16b, \x6\().16b 281 ext \x4\().16b, \x3\().16b, \x3\().16b, #8 282 eor \t3\().16b, \t3\().16b, \x2\().16b 283 ext \x5\().16b, \x7\().16b, \x7\().16b, #8 284 eor \t4\().16b, \t4\().16b, \x7\().16b 285 ext \x3\().16b, \x6\().16b, \x6\().16b, #8 286 eor \t3\().16b, \t3\().16b, \x7\().16b 287 ext \x6\().16b, \x2\().16b, \x2\().16b, #8 288 eor \x7\().16b, \t1\().16b, \t5\().16b 289 .ifb \inv 290 eor \x2\().16b, \t0\().16b, \t4\().16b 291 eor \x4\().16b, \x4\().16b, \t3\().16b 292 eor \x5\().16b, \x5\().16b, \t7\().16b 293 eor \x3\().16b, \x3\().16b, \t6\().16b 294 eor \x6\().16b, \x6\().16b, \t2\().16b 295 .else 296 eor \t3\().16b, \t3\().16b, \x4\().16b 297 eor \x5\().16b, \x5\().16b, \t7\().16b 298 eor \x2\().16b, \x3\().16b, \t6\().16b 299 eor \x3\().16b, \t0\().16b, \t4\().16b 300 eor \x4\().16b, \x6\().16b, \t2\().16b 301 mov \x6\().16b, \t3\().16b 302 .endif 303 .endm 304 305 .macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ 306 t0, t1, t2, t3, t4, t5, t6, t7 307 ext \t0\().16b, \x0\().16b, \x0\().16b, #8 308 ext \t6\().16b, \x6\().16b, \x6\().16b, #8 309 ext \t7\().16b, \x7\().16b, \x7\().16b, #8 310 eor \t0\().16b, \t0\().16b, \x0\().16b 311 ext \t1\().16b, \x1\().16b, \x1\().16b, #8 312 eor \t6\().16b, \t6\().16b, \x6\().16b 313 ext \t2\().16b, \x2\().16b, \x2\().16b, #8 314 eor \t7\().16b, \t7\().16b, \x7\().16b 315 ext \t3\().16b, \x3\().16b, \x3\().16b, #8 316 eor \t1\().16b, \t1\().16b, \x1\().16b 317 ext \t4\().16b, \x4\().16b, \x4\().16b, #8 318 eor \t2\().16b, \t2\().16b, \x2\().16b 319 ext \t5\().16b, \x5\().16b, \x5\().16b, #8 320 eor \t3\().16b, \t3\().16b, \x3\().16b 321 eor \t4\().16b, \t4\().16b, \x4\().16b 322 eor \t5\().16b, \t5\().16b, \x5\().16b 323 eor \x0\().16b, \x0\().16b, \t6\().16b 324 eor \x1\().16b, \x1\().16b, \t6\().16b 325 eor \x2\().16b, \x2\().16b, \t0\().16b 326 eor \x4\().16b, \x4\().16b, \t2\().16b 327 eor \x3\().16b, \x3\().16b, \t1\().16b 328 eor \x1\().16b, \x1\().16b, \t7\().16b 329 eor \x2\().16b, \x2\().16b, \t7\().16b 330 eor \x4\().16b, \x4\().16b, \t6\().16b 331 eor \x5\().16b, \x5\().16b, \t3\().16b 332 eor \x3\().16b, \x3\().16b, \t6\().16b 333 eor \x6\().16b, \x6\().16b, \t4\().16b 334 eor \x4\().16b, \x4\().16b, \t7\().16b 335 eor \x5\().16b, \x5\().16b, \t7\().16b 336 eor \x7\().16b, \x7\().16b, \t5\().16b 337 mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ 338 \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1 339 .endm 340 341 .macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1 342 ushr \t0\().2d, \b0\().2d, #\n 343 ushr \t1\().2d, \b1\().2d, #\n 344 eor \t0\().16b, \t0\().16b, \a0\().16b 345 eor \t1\().16b, \t1\().16b, \a1\().16b 346 and \t0\().16b, \t0\().16b, \mask\().16b 347 and \t1\().16b, \t1\().16b, \mask\().16b 348 eor \a0\().16b, \a0\().16b, \t0\().16b 349 shl \t0\().2d, \t0\().2d, #\n 350 eor \a1\().16b, \a1\().16b, \t1\().16b 351 shl \t1\().2d, \t1\().2d, #\n 352 eor \b0\().16b, \b0\().16b, \t0\().16b 353 eor \b1\().16b, \b1\().16b, \t1\().16b 354 .endm 355 356 .macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3 357 movi \t0\().16b, #0x55 358 movi \t1\().16b, #0x33 359 swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3 360 swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3 361 movi \t0\().16b, #0x0f 362 swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3 363 swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3 364 swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3 365 swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3 366 .endm 367 368 369 .align 6 370M0: .octa 0x0004080c0105090d02060a0e03070b0f 371 372M0SR: .octa 0x0004080c05090d010a0e02060f03070b 373SR: .octa 0x0f0e0d0c0a09080b0504070600030201 374SRM0: .octa 0x01060b0c0207080d0304090e00050a0f 375 376M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 377ISR: .octa 0x0f0e0d0c080b0a090504070602010003 378ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f 379 380 /* 381 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) 382 */ 383SYM_FUNC_START(aesbs_convert_key) 384 ld1 {v7.4s}, [x1], #16 // load round 0 key 385 ld1 {v17.4s}, [x1], #16 // load round 1 key 386 387 movi v8.16b, #0x01 // bit masks 388 movi v9.16b, #0x02 389 movi v10.16b, #0x04 390 movi v11.16b, #0x08 391 movi v12.16b, #0x10 392 movi v13.16b, #0x20 393 movi v14.16b, #0x40 394 movi v15.16b, #0x80 395 ldr q16, M0 396 397 sub x2, x2, #1 398 str q7, [x0], #16 // save round 0 key 399 400.Lkey_loop: 401 tbl v7.16b ,{v17.16b}, v16.16b 402 ld1 {v17.4s}, [x1], #16 // load next round key 403 404 cmtst v0.16b, v7.16b, v8.16b 405 cmtst v1.16b, v7.16b, v9.16b 406 cmtst v2.16b, v7.16b, v10.16b 407 cmtst v3.16b, v7.16b, v11.16b 408 cmtst v4.16b, v7.16b, v12.16b 409 cmtst v5.16b, v7.16b, v13.16b 410 cmtst v6.16b, v7.16b, v14.16b 411 cmtst v7.16b, v7.16b, v15.16b 412 not v0.16b, v0.16b 413 not v1.16b, v1.16b 414 not v5.16b, v5.16b 415 not v6.16b, v6.16b 416 417 subs x2, x2, #1 418 stp q0, q1, [x0], #128 419 stp q2, q3, [x0, #-96] 420 stp q4, q5, [x0, #-64] 421 stp q6, q7, [x0, #-32] 422 b.ne .Lkey_loop 423 424 movi v7.16b, #0x63 // compose .L63 425 eor v17.16b, v17.16b, v7.16b 426 str q17, [x0] 427 ret 428SYM_FUNC_END(aesbs_convert_key) 429 430 .align 4 431SYM_FUNC_START_LOCAL(aesbs_encrypt8) 432 ldr q9, [bskey], #16 // round 0 key 433 ldr q8, M0SR 434 ldr q24, SR 435 436 eor v10.16b, v0.16b, v9.16b // xor with round0 key 437 eor v11.16b, v1.16b, v9.16b 438 tbl v0.16b, {v10.16b}, v8.16b 439 eor v12.16b, v2.16b, v9.16b 440 tbl v1.16b, {v11.16b}, v8.16b 441 eor v13.16b, v3.16b, v9.16b 442 tbl v2.16b, {v12.16b}, v8.16b 443 eor v14.16b, v4.16b, v9.16b 444 tbl v3.16b, {v13.16b}, v8.16b 445 eor v15.16b, v5.16b, v9.16b 446 tbl v4.16b, {v14.16b}, v8.16b 447 eor v10.16b, v6.16b, v9.16b 448 tbl v5.16b, {v15.16b}, v8.16b 449 eor v11.16b, v7.16b, v9.16b 450 tbl v6.16b, {v10.16b}, v8.16b 451 tbl v7.16b, {v11.16b}, v8.16b 452 453 bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 454 455 sub rounds, rounds, #1 456 b .Lenc_sbox 457 458.Lenc_loop: 459 shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 460.Lenc_sbox: 461 sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ 462 v13, v14, v15 463 subs rounds, rounds, #1 464 b.cc .Lenc_done 465 466 enc_next_rk 467 468 mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \ 469 v13, v14, v15 470 471 add_round_key v0, v1, v2, v3, v4, v5, v6, v7 472 473 b.ne .Lenc_loop 474 ldr q24, SRM0 475 b .Lenc_loop 476 477.Lenc_done: 478 ldr q12, [bskey] // last round key 479 480 bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11 481 482 eor v0.16b, v0.16b, v12.16b 483 eor v1.16b, v1.16b, v12.16b 484 eor v4.16b, v4.16b, v12.16b 485 eor v6.16b, v6.16b, v12.16b 486 eor v3.16b, v3.16b, v12.16b 487 eor v7.16b, v7.16b, v12.16b 488 eor v2.16b, v2.16b, v12.16b 489 eor v5.16b, v5.16b, v12.16b 490 ret 491SYM_FUNC_END(aesbs_encrypt8) 492 493 .align 4 494SYM_FUNC_START_LOCAL(aesbs_decrypt8) 495 lsl x9, rounds, #7 496 add bskey, bskey, x9 497 498 ldr q9, [bskey, #-112]! // round 0 key 499 ldr q8, M0ISR 500 ldr q24, ISR 501 502 eor v10.16b, v0.16b, v9.16b // xor with round0 key 503 eor v11.16b, v1.16b, v9.16b 504 tbl v0.16b, {v10.16b}, v8.16b 505 eor v12.16b, v2.16b, v9.16b 506 tbl v1.16b, {v11.16b}, v8.16b 507 eor v13.16b, v3.16b, v9.16b 508 tbl v2.16b, {v12.16b}, v8.16b 509 eor v14.16b, v4.16b, v9.16b 510 tbl v3.16b, {v13.16b}, v8.16b 511 eor v15.16b, v5.16b, v9.16b 512 tbl v4.16b, {v14.16b}, v8.16b 513 eor v10.16b, v6.16b, v9.16b 514 tbl v5.16b, {v15.16b}, v8.16b 515 eor v11.16b, v7.16b, v9.16b 516 tbl v6.16b, {v10.16b}, v8.16b 517 tbl v7.16b, {v11.16b}, v8.16b 518 519 bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 520 521 sub rounds, rounds, #1 522 b .Ldec_sbox 523 524.Ldec_loop: 525 shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 526.Ldec_sbox: 527 inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ 528 v13, v14, v15 529 subs rounds, rounds, #1 530 b.cc .Ldec_done 531 532 dec_next_rk 533 534 add_round_key v0, v1, v6, v4, v2, v7, v3, v5 535 536 inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \ 537 v13, v14, v15 538 539 b.ne .Ldec_loop 540 ldr q24, ISRM0 541 b .Ldec_loop 542.Ldec_done: 543 ldr q12, [bskey, #-16] // last round key 544 545 bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11 546 547 eor v0.16b, v0.16b, v12.16b 548 eor v1.16b, v1.16b, v12.16b 549 eor v6.16b, v6.16b, v12.16b 550 eor v4.16b, v4.16b, v12.16b 551 eor v2.16b, v2.16b, v12.16b 552 eor v7.16b, v7.16b, v12.16b 553 eor v3.16b, v3.16b, v12.16b 554 eor v5.16b, v5.16b, v12.16b 555 ret 556SYM_FUNC_END(aesbs_decrypt8) 557 558 /* 559 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 560 * int blocks) 561 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 562 * int blocks) 563 */ 564 .macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 565 frame_push 5 566 567 mov x19, x0 568 mov x20, x1 569 mov x21, x2 570 mov x22, x3 571 mov x23, x4 572 57399: mov x5, #1 574 lsl x5, x5, x23 575 subs w23, w23, #8 576 csel x23, x23, xzr, pl 577 csel x5, x5, xzr, mi 578 579 ld1 {v0.16b}, [x20], #16 580 tbnz x5, #1, 0f 581 ld1 {v1.16b}, [x20], #16 582 tbnz x5, #2, 0f 583 ld1 {v2.16b}, [x20], #16 584 tbnz x5, #3, 0f 585 ld1 {v3.16b}, [x20], #16 586 tbnz x5, #4, 0f 587 ld1 {v4.16b}, [x20], #16 588 tbnz x5, #5, 0f 589 ld1 {v5.16b}, [x20], #16 590 tbnz x5, #6, 0f 591 ld1 {v6.16b}, [x20], #16 592 tbnz x5, #7, 0f 593 ld1 {v7.16b}, [x20], #16 594 5950: mov bskey, x21 596 mov rounds, x22 597 bl \do8 598 599 st1 {\o0\().16b}, [x19], #16 600 tbnz x5, #1, 1f 601 st1 {\o1\().16b}, [x19], #16 602 tbnz x5, #2, 1f 603 st1 {\o2\().16b}, [x19], #16 604 tbnz x5, #3, 1f 605 st1 {\o3\().16b}, [x19], #16 606 tbnz x5, #4, 1f 607 st1 {\o4\().16b}, [x19], #16 608 tbnz x5, #5, 1f 609 st1 {\o5\().16b}, [x19], #16 610 tbnz x5, #6, 1f 611 st1 {\o6\().16b}, [x19], #16 612 tbnz x5, #7, 1f 613 st1 {\o7\().16b}, [x19], #16 614 615 cbz x23, 1f 616 b 99b 617 6181: frame_pop 619 ret 620 .endm 621 622 .align 4 623SYM_FUNC_START(aesbs_ecb_encrypt) 624 __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 625SYM_FUNC_END(aesbs_ecb_encrypt) 626 627 .align 4 628SYM_FUNC_START(aesbs_ecb_decrypt) 629 __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 630SYM_FUNC_END(aesbs_ecb_decrypt) 631 632 /* 633 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 634 * int blocks, u8 iv[]) 635 */ 636 .align 4 637SYM_FUNC_START(aesbs_cbc_decrypt) 638 frame_push 6 639 640 mov x19, x0 641 mov x20, x1 642 mov x21, x2 643 mov x22, x3 644 mov x23, x4 645 mov x24, x5 646 64799: mov x6, #1 648 lsl x6, x6, x23 649 subs w23, w23, #8 650 csel x23, x23, xzr, pl 651 csel x6, x6, xzr, mi 652 653 ld1 {v0.16b}, [x20], #16 654 mov v25.16b, v0.16b 655 tbnz x6, #1, 0f 656 ld1 {v1.16b}, [x20], #16 657 mov v26.16b, v1.16b 658 tbnz x6, #2, 0f 659 ld1 {v2.16b}, [x20], #16 660 mov v27.16b, v2.16b 661 tbnz x6, #3, 0f 662 ld1 {v3.16b}, [x20], #16 663 mov v28.16b, v3.16b 664 tbnz x6, #4, 0f 665 ld1 {v4.16b}, [x20], #16 666 mov v29.16b, v4.16b 667 tbnz x6, #5, 0f 668 ld1 {v5.16b}, [x20], #16 669 mov v30.16b, v5.16b 670 tbnz x6, #6, 0f 671 ld1 {v6.16b}, [x20], #16 672 mov v31.16b, v6.16b 673 tbnz x6, #7, 0f 674 ld1 {v7.16b}, [x20] 675 6760: mov bskey, x21 677 mov rounds, x22 678 bl aesbs_decrypt8 679 680 ld1 {v24.16b}, [x24] // load IV 681 682 eor v1.16b, v1.16b, v25.16b 683 eor v6.16b, v6.16b, v26.16b 684 eor v4.16b, v4.16b, v27.16b 685 eor v2.16b, v2.16b, v28.16b 686 eor v7.16b, v7.16b, v29.16b 687 eor v0.16b, v0.16b, v24.16b 688 eor v3.16b, v3.16b, v30.16b 689 eor v5.16b, v5.16b, v31.16b 690 691 st1 {v0.16b}, [x19], #16 692 mov v24.16b, v25.16b 693 tbnz x6, #1, 1f 694 st1 {v1.16b}, [x19], #16 695 mov v24.16b, v26.16b 696 tbnz x6, #2, 1f 697 st1 {v6.16b}, [x19], #16 698 mov v24.16b, v27.16b 699 tbnz x6, #3, 1f 700 st1 {v4.16b}, [x19], #16 701 mov v24.16b, v28.16b 702 tbnz x6, #4, 1f 703 st1 {v2.16b}, [x19], #16 704 mov v24.16b, v29.16b 705 tbnz x6, #5, 1f 706 st1 {v7.16b}, [x19], #16 707 mov v24.16b, v30.16b 708 tbnz x6, #6, 1f 709 st1 {v3.16b}, [x19], #16 710 mov v24.16b, v31.16b 711 tbnz x6, #7, 1f 712 ld1 {v24.16b}, [x20], #16 713 st1 {v5.16b}, [x19], #16 7141: st1 {v24.16b}, [x24] // store IV 715 716 cbz x23, 2f 717 b 99b 718 7192: frame_pop 720 ret 721SYM_FUNC_END(aesbs_cbc_decrypt) 722 723 .macro next_tweak, out, in, const, tmp 724 sshr \tmp\().2d, \in\().2d, #63 725 and \tmp\().16b, \tmp\().16b, \const\().16b 726 add \out\().2d, \in\().2d, \in\().2d 727 ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 728 eor \out\().16b, \out\().16b, \tmp\().16b 729 .endm 730 731 /* 732 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 733 * int blocks, u8 iv[]) 734 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 735 * int blocks, u8 iv[]) 736 */ 737SYM_FUNC_START_LOCAL(__xts_crypt8) 738 movi v18.2s, #0x1 739 movi v19.2s, #0x87 740 uzp1 v18.4s, v18.4s, v19.4s 741 742 ld1 {v0.16b-v3.16b}, [x1], #64 743 ld1 {v4.16b-v7.16b}, [x1], #64 744 745 next_tweak v26, v25, v18, v19 746 next_tweak v27, v26, v18, v19 747 next_tweak v28, v27, v18, v19 748 next_tweak v29, v28, v18, v19 749 next_tweak v30, v29, v18, v19 750 next_tweak v31, v30, v18, v19 751 next_tweak v16, v31, v18, v19 752 next_tweak v17, v16, v18, v19 753 754 eor v0.16b, v0.16b, v25.16b 755 eor v1.16b, v1.16b, v26.16b 756 eor v2.16b, v2.16b, v27.16b 757 eor v3.16b, v3.16b, v28.16b 758 eor v4.16b, v4.16b, v29.16b 759 eor v5.16b, v5.16b, v30.16b 760 eor v6.16b, v6.16b, v31.16b 761 eor v7.16b, v7.16b, v16.16b 762 763 stp q16, q17, [sp, #16] 764 765 mov bskey, x2 766 mov rounds, x3 767 br x16 768SYM_FUNC_END(__xts_crypt8) 769 770 .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 771 stp x29, x30, [sp, #-48]! 772 mov x29, sp 773 774 ld1 {v25.16b}, [x5] 775 7760: adr x16, \do8 777 bl __xts_crypt8 778 779 eor v16.16b, \o0\().16b, v25.16b 780 eor v17.16b, \o1\().16b, v26.16b 781 eor v18.16b, \o2\().16b, v27.16b 782 eor v19.16b, \o3\().16b, v28.16b 783 784 ldp q24, q25, [sp, #16] 785 786 eor v20.16b, \o4\().16b, v29.16b 787 eor v21.16b, \o5\().16b, v30.16b 788 eor v22.16b, \o6\().16b, v31.16b 789 eor v23.16b, \o7\().16b, v24.16b 790 791 st1 {v16.16b-v19.16b}, [x0], #64 792 st1 {v20.16b-v23.16b}, [x0], #64 793 794 subs x4, x4, #8 795 b.gt 0b 796 797 st1 {v25.16b}, [x5] 798 ldp x29, x30, [sp], #48 799 ret 800 .endm 801 802SYM_FUNC_START(aesbs_xts_encrypt) 803 __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 804SYM_FUNC_END(aesbs_xts_encrypt) 805 806SYM_FUNC_START(aesbs_xts_decrypt) 807 __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 808SYM_FUNC_END(aesbs_xts_decrypt) 809 810 .macro next_ctr, v 811 mov \v\().d[1], x8 812 adds x8, x8, #1 813 mov \v\().d[0], x7 814 adc x7, x7, xzr 815 rev64 \v\().16b, \v\().16b 816 .endm 817 818 /* 819 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], 820 * int rounds, int blocks, u8 iv[]) 821 */ 822SYM_FUNC_START(aesbs_ctr_encrypt) 823 stp x29, x30, [sp, #-16]! 824 mov x29, sp 825 826 ldp x7, x8, [x5] 827 ld1 {v0.16b}, [x5] 828CPU_LE( rev x7, x7 ) 829CPU_LE( rev x8, x8 ) 830 adds x8, x8, #1 831 adc x7, x7, xzr 832 8330: next_ctr v1 834 next_ctr v2 835 next_ctr v3 836 next_ctr v4 837 next_ctr v5 838 next_ctr v6 839 next_ctr v7 840 841 mov bskey, x2 842 mov rounds, x3 843 bl aesbs_encrypt8 844 845 ld1 { v8.16b-v11.16b}, [x1], #64 846 ld1 {v12.16b-v15.16b}, [x1], #64 847 848 eor v8.16b, v0.16b, v8.16b 849 eor v9.16b, v1.16b, v9.16b 850 eor v10.16b, v4.16b, v10.16b 851 eor v11.16b, v6.16b, v11.16b 852 eor v12.16b, v3.16b, v12.16b 853 eor v13.16b, v7.16b, v13.16b 854 eor v14.16b, v2.16b, v14.16b 855 eor v15.16b, v5.16b, v15.16b 856 857 st1 { v8.16b-v11.16b}, [x0], #64 858 st1 {v12.16b-v15.16b}, [x0], #64 859 860 next_ctr v0 861 subs x4, x4, #8 862 b.gt 0b 863 864 st1 {v0.16b}, [x5] 865 ldp x29, x30, [sp], #16 866 ret 867SYM_FUNC_END(aesbs_ctr_encrypt)