ghash-ce-core.S (17529B)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

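	// HH34 = { lo(H^3) ^ hi(H^3), lo(H^4) ^ hi(H^4) }: the "sum of
	// halves" operands used for the (a1 + a0)(b1 + b0) Karatsuba
	// products below, just as SHASH2 above holds them for H and H^2.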
	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

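	// Fold the H^3 partial products into the XL2/XH2/XM2 accumulators.
	// The first block, XORed with the current digest, is multiplied by
	// H^4 below; a single reduction then covers all four blocks.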
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

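	//
	// Combined GCM en/decryption: up to four blocks of input are
	// processed per iteration with AES in CTR mode, and the resulting
	// ciphertext is folded into the GHASH digest: after encryption when
	// \enc == 1, or straight from the input when \enc == 0 (decryption).
	// A zero byte count in x0 skips the bulk processing and goes
	// straight to the tag handling.
	//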
	.align		6
	.macro		pmull_gcm_do_crypt, enc
	stp		x29, x30, [sp, #-32]!
	mov		x29, sp
	str		x19, [sp, #24]

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f			// tag only?

	ldr		w8, [x5, #12]		// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4			// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4		// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *              INP0     INP1     INP2     INP3
	 *  1 byte  |        |        |        |x       |
	 * 16 bytes |        |        |        |xxxxxxxx|
	 * 17 bytes |        |        |xxxxxxxx|x       |
	 * 47 bytes |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldp		x19, x10, [sp, #24]
	cbz		x10, 5f			// output tag?
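	// Finalise the tag: fold the lengths block into GHASH, convert the
	// digest to big endian byte order and XOR it with the AES encryption
	// of the counter block at [x5] with its final word set to a big
	// endian 1.  Encryption stores the result; decryption compares it
	// with the supplied tag and returns 0 for a match, -1 otherwise.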

	ld1		{INP3.16b}, [x10]	// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)	// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]		// store tag
	.else
	ldp		x11, x12, [sp, #40]	// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]	// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]	// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b	// compare tags
	mvn		XL.16b, XL.16b		// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
	sminv		b0, XL.16b		// signed minimum across XL
	smov		w0, v0.b[0]		// return b0
	.endif

4:	ldp		x29, x30, [sp], #32
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]		// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f		// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f		// 2 blocks?
	tbz		w9, #1, 2f		// 1 block?

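	// Fewer than four blocks: fold the digest into the first valid
	// block and enter the multiply chain part way through, so that only
	// the required powers of H are used: at .Lgh3 for three blocks
	// (below), or at .Lgh2/.Lgh1 for two blocks or a single block (via
	// the branches above).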
	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous
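	// .Lpermute_table above is 16 bytes of 0xff followed by the identity
	// permutation 0x0..0xf, repeated once.  Indexed at an offset derived
	// from the number of valid bytes, it supplies the TBL/TBX masks used
	// to shift partial blocks into position: out of range (0xff) indices
	// clear a byte with TBL and leave it unchanged with TBX.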