aegis128-aesni-asm.S (14123B)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AES-NI + SSE2 implementation of AEGIS-128
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * Register roles.
 *
 * STATE0-STATE4 hold the five 128-bit state words.  KEY and MSG alias the
 * same register (%xmm5): KEY is only used by the init routine, MSG by
 * everything else.  T0 and T1 are scratch.
 */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
#define KEY	%xmm5
#define MSG	%xmm5
#define T0	%xmm6
#define T1	%xmm7

/*
 * Integer arguments, in the order used by the prototypes quoted above each
 * function: (state, length, src, dst).
 *
 * LEN names the full 64-bit %rsi, but the prototypes declare the length as
 * 'unsigned int'.  Only the low 32 bits (%esi) of such an argument are
 * defined on entry, so every entry point that compares or subtracts LEN
 * zero-extends %esi first.
 */
#define STATEP	%rdi
#define LEN	%rsi
#define SRC	%rdx
#define DST	%rcx

.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/* Byte indices 0..15; compared against a broadcast byte count to build a mask. */
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
.align 16
.Laegis128_counter:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f

.text

/*
 * aegis128_update
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (shifted positions)
 * changed:
 *   T0
 *
 * Every register is replaced by one AES round of itself keyed with the next
 * register (STATE4 is keyed with STATE0, STATE3 with the saved old STATE4).
 * The registers are updated in place, so the logical state word held by each
 * register moves by one position per call: what was in STATE0's role is now
 * played by STATE4, and so on.  Callers account for this by rotating which
 * register they xor the message into and by storing the registers back in
 * rotated order.
 */
.macro aegis128_update
	movdqa STATE4, T0	/* keep old STATE4: it is overwritten first */
	aesenc STATE0, STATE4
	aesenc STATE1, STATE0
	aesenc STATE2, STATE1
	aesenc STATE3, STATE2
	aesenc T0,     STATE3
.endm

/*
 * __load_partial: internal ABI
 * input:
 *   LEN - bytes
 *   SRC - src
 * output:
 *   MSG  - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 *
 * Gathers a partial block from SRC into MSG with at most one load each of
 * 1, 2, 4 and 8 bytes, selected by bits 0..3 of LEN.  Each chunk is read from
 * the offset obtained by clearing that bit and all lower bits of
 * (LEN & 0x1f), so the chunks tile the buffer in memory order and nothing
 * past the selected bytes is read.  Bytes of MSG that are not loaded are zero.
 *
 * Every use of LEN here is masked with a small constant, so the (undefined)
 * upper half of %rsi cannot influence the result.
 */
SYM_FUNC_START_LOCAL(__load_partial)
	xor %r9d, %r9d
	pxor MSG, MSG

	/* trailing single byte? */
	mov LEN, %r8
	and $0x1, %r8
	jz .Lld_partial_1

	mov LEN, %r8
	and $0x1E, %r8
	add SRC, %r8
	mov (%r8), %r9b

.Lld_partial_1:
	/* 2-byte chunk?  Goes below whatever has been collected so far. */
	mov LEN, %r8
	and $0x2, %r8
	jz .Lld_partial_2

	mov LEN, %r8
	and $0x1C, %r8
	add SRC, %r8
	shl $0x10, %r9
	mov (%r8), %r9w

.Lld_partial_2:
	/* 4-byte chunk? */
	mov LEN, %r8
	and $0x4, %r8
	jz .Lld_partial_4

	mov LEN, %r8
	and $0x18, %r8
	add SRC, %r8
	shl $32, %r9
	mov (%r8), %r8d		/* 32-bit load zero-extends into %r8 */
	xor %r8, %r9

.Lld_partial_4:
	/* up to 7 collected bytes now sit in the low qword of MSG */
	movq %r9, MSG

	/* 8-byte chunk?  It precedes the bytes above, so shift them up. */
	mov LEN, %r8
	and $0x8, %r8
	jz .Lld_partial_8

	mov LEN, %r8
	and $0x10, %r8
	add SRC, %r8
	pslldq $8, MSG
	movq (%r8), T0
	pxor T0, MSG

.Lld_partial_8:
	RET
SYM_FUNC_END(__load_partial)

/*
 * __store_partial: internal ABI
 * input:
 *   LEN - bytes (only the low 32 bits, %esi, are read)
 *   DST - dst
 *   T0  - message block to store
 * changed:
 *   T0
 *   %r8
 *   %r9
 *   %r10
 *
 * Writes the first LEN bytes of T0 to DST as an 8-, 4-, 2- and 1-byte store,
 * each performed only while at least that many bytes remain.
 */
SYM_FUNC_START_LOCAL(__store_partial)
	/*
	 * Read the length as 32-bit: it originates from an 'unsigned int'
	 * argument, so the upper half of %rsi is not guaranteed to be zero.
	 * Using it unmasked in the comparisons below could store past DST.
	 */
	mov %esi, %r8d
	mov DST, %r9

	movq T0, %r10

	cmp $8, %r8
	jl .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0		/* bring the upper qword down for the rest */
	movq T0, %r10

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jl .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jl .Lst_partial_2

	mov %r10w, (%r9)
	shr $0x10, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jl .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	RET
SYM_FUNC_END(__store_partial)

/*
 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 *
 * Builds the initial state from the key (%rsi) and IV (%rdx) and writes the
 * five state words to *state (%rdi).  The key is loaded with movdqa and must
 * therefore be 16-byte aligned; the IV and the state buffer may be unaligned.
 */
SYM_FUNC_START(crypto_aegis128_aesni_init)
	FRAME_BEGIN

	/* load IV: */
	movdqu (%rdx), T1

	/* load key: */
	movdqa (%rsi), KEY
	pxor KEY, T1		/* T1 = KEY xor IV */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/* load the constants (RIP-relative, so no absolute relocation): */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3
	pxor STATE1, STATE4

	/*
	 * update 10 times with KEY / KEY xor IV; the xor target steps back
	 * one register per round to follow the shifting state positions:
	 */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1,  STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1,  STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1,  STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1,  STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1,  STATE0

	/* store the state (10 updates = two full rotations, so no shift): */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)

	FRAME_END
	RET
SYM_FUNC_END(crypto_aegis128_aesni_init)

/*
 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 *                               const void *data);
 *
 * Absorbs the whole 16-byte blocks of the associated data into the state.
 * Returns without touching the state when length < 16; a trailing partial
 * block (length % 16) is not consumed here.
 *
 * The loop is unrolled five times, once per rotation of the state registers;
 * the exit label taken after the last block selects the matching store order.
 */
SYM_FUNC_START(crypto_aegis128_aesni_ad)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument: the upper half of %rsi is undefined
	 * on entry.  A 32-bit register write clears bits 63:32, making LEN
	 * safe to use as a 64-bit value below.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lad_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* pick the aligned-load loop only when SRC is 16-byte aligned */
	mov SRC, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	movdqa 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_1

	movdqa 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_2

	movdqa 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_3

	movdqa 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_4

	movdqa 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_0

	add $0x50, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	movdqu 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_1

	movdqu 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_2

	movdqu 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_3

	movdqu 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_4

	movdqu 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_0

	add $0x50, SRC
	jmp .Lad_u_loop

	/*
	 * store the state: .Lad_out_N is reached after N updates (mod 5) and
	 * undoes the corresponding register rotation.
	 */
.Lad_out_0:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	RET

.Lad_out_1:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	RET

.Lad_out_2:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	RET

.Lad_out_3:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	RET

.Lad_out_4:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	RET

.Lad_out:
	FRAME_END
	RET
SYM_FUNC_END(crypto_aegis128_aesni_ad)

/*
 * encrypt_block a s0 s1 s2 s3 s4 i
 *
 * Encrypts the 16-byte block at index \i (relative to SRC/DST) and absorbs
 * the plaintext into the state.
 *   \a       - 'a' for aligned (movdqa) or 'u' for unaligned (movdqu) access
 *   \s0-\s4  - registers currently playing the roles of state words 0-4
 *              (\s0 is accepted for symmetry but not referenced)
 *   \i       - block index within the current 5-block group; also selects
 *              the exit label .Lenc_out_\i
 *
 * Ciphertext = MSG xor \s1 xor \s4 xor (\s2 and \s3).  After the update the
 * plaintext is xored into \s4, which by then holds the new state word 0.
 * Leaves via .Lenc_out_\i when fewer than 16 bytes remain.
 * Changes MSG, T0, T1, LEN and the state registers.
 */
.macro encrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	movdqa MSG, T0
	pxor \s1, T0
	pxor \s4, T0
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, T0
	movdq\a T0, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lenc_out_\i
.endm

/*
 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 *                                const void *src, void *dst);
 */
SYM_FUNC_START(crypto_aegis128_aesni_enc)
	/*
	 * Encrypts the whole 16-byte blocks of src into dst and absorbs the
	 * plaintext into the state.  Returns without touching anything when
	 * length < 16; a trailing partial block is left to the _enc_tail
	 * routine.  The loop body is unrolled five times, once per rotation
	 * of the state registers.
	 */
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument: the upper half of %rsi is undefined
	 * on entry.  A 32-bit register write clears bits 63:32, making LEN
	 * safe to use as a 64-bit value below.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lenc_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* aligned accesses only if both SRC and DST are 16-byte aligned */
	mov SRC, %r8
	or  DST, %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_u_loop

	/*
	 * store the state: .Lenc_out_N is reached after block index N, i.e.
	 * after N + 1 updates, and undoes the matching register rotation.
	 */
.Lenc_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	RET

.Lenc_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	RET

.Lenc_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	RET

.Lenc_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	RET

.Lenc_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	RET

.Lenc_out:
	FRAME_END
	RET
SYM_FUNC_END(crypto_aegis128_aesni_enc)

/*
 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Encrypts one final partial block: loads 'length' bytes from src (the rest
 * of the block is zero), writes 'length' bytes of ciphertext to dst, and
 * absorbs the zero-padded plaintext block into the state.
 */
SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument: clear the undefined upper half of
	 * %rsi before LEN is handed to the partial load/store helpers.
	 */
	mov %esi, %esi

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* encrypt message: */
	call __load_partial

	movdqa MSG, T0
	pxor STATE1, T0
	pxor STATE4, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0

	call __store_partial

	aegis128_update
	pxor MSG, STATE4

	/* store the state (one update = rotated by one position): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	RET
SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)

/*
 * decrypt_block a s0 s1 s2 s3 s4 i
 *
 * Decrypts the 16-byte block at index \i (relative to SRC/DST) and absorbs
 * the recovered plaintext into the state.
 *   \a       - 'a' for aligned (movdqa) or 'u' for unaligned (movdqu) access
 *   \s0-\s4  - registers currently playing the roles of state words 0-4
 *              (\s0 is accepted for symmetry but not referenced)
 *   \i       - block index within the current 5-block group; also selects
 *              the exit label .Ldec_out_\i
 *
 * Plaintext = ciphertext xor \s1 xor \s4 xor (\s2 and \s3); it is computed in
 * MSG, stored, and after the update xored into \s4.
 * Leaves via .Ldec_out_\i when fewer than 16 bytes remain.
 * Changes MSG, T0 (via aegis128_update), T1, LEN and the state registers.
 */
.macro decrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	pxor \s1, MSG
	pxor \s4, MSG
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, MSG
	movdq\a MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Ldec_out_\i
.endm

/*
 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Decrypts the whole 16-byte blocks of src into dst and absorbs the
 * plaintext into the state.  Returns without touching anything when
 * length < 16; a trailing partial block is left to the _dec_tail routine.
 */
SYM_FUNC_START(crypto_aegis128_aesni_dec)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument: the upper half of %rsi is undefined
	 * on entry.  A 32-bit register write clears bits 63:32, making LEN
	 * safe to use as a 64-bit value below.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Ldec_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* aligned accesses only if both SRC and DST are 16-byte aligned */
	mov SRC, %r8
	or  DST, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_u_loop

	/*
	 * store the state: .Ldec_out_N is reached after block index N, i.e.
	 * after N + 1 updates, and undoes the matching register rotation.
	 */
.Ldec_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	RET

.Ldec_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	RET

.Ldec_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	RET

.Ldec_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	RET

.Ldec_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	RET

.Ldec_out:
	FRAME_END
	RET
SYM_FUNC_END(crypto_aegis128_aesni_dec)

/*
 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 */
SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
	/*
	 * Decrypts one final partial block: loads 'length' bytes of
	 * ciphertext from src, writes 'length' bytes of plaintext to dst, and
	 * absorbs the plaintext - with every byte at index >= length forced
	 * to zero - into the state.
	 */
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument: clear the undefined upper half of
	 * %rsi before LEN is handed to the partial load/store helpers.
	 */
	mov %esi, %esi

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	call __load_partial

	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	movdqa MSG, T0
	call __store_partial

	/*
	 * mask with byte count: the four unpacks replicate the low byte of
	 * LEN into all 16 byte lanes; comparing against the indices 0..15
	 * yields 0xff exactly in the lanes whose index is below the count.
	 * The keystream bytes xored into the zero padding are thereby
	 * cleared before the block is absorbed.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	movdqa .Laegis128_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG

	aegis128_update
	pxor MSG, STATE4

	/* store the state (one update = rotated by one position): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	RET
SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)

/*
 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Runs the finalization rounds and xors the resulting tag into the 16 bytes
 * at tag_xor (%rsi).  The state buffer is only read; it is not written back.
 *
 * NOTE(review): assoclen (%rdx) and cryptlen (%rcx) are read as full 64-bit
 * registers, matching the u64 types in the prototype above.  If the C
 * declaration uses a 32-bit type instead, the upper halves are undefined
 * on entry - confirm against the caller.
 */
SYM_FUNC_START(crypto_aegis128_aesni_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* prepare length block: assoclen in the low qword, cryptlen in the high */
	movq %rdx, MSG
	movq %rcx, T0
	pslldq $8, T0
	pxor T0, MSG
	psllq $3, MSG /* multiply by 8 (to get bit count) */

	pxor STATE3, MSG

	/*
	 * update state: seven rounds with the same block; the xor target
	 * steps back one register per round to follow the shifting state.
	 */
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3
	aegis128_update; pxor MSG, STATE2
	aegis128_update; pxor MSG, STATE1
	aegis128_update; pxor MSG, STATE0
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3

	/* xor tag: fold all five state words into the caller's buffer */
	movdqu (%rsi), MSG

	pxor STATE0, MSG
	pxor STATE1, MSG
	pxor STATE2, MSG
	pxor STATE3, MSG
	pxor STATE4, MSG

	movdqu MSG, (%rsi)

	FRAME_END
	RET
SYM_FUNC_END(crypto_aegis128_aesni_final)