aesni-intel_avx-x86_64.S
########################################################################
# Copyright (c) 2013, Intel Corporation
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
##
## Authors:
##	Erdinc Ozturk <erdinc.ozturk@intel.com>
##	Vinodh Gopal <vinodh.gopal@intel.com>
##	James Guilford <james.guilford@intel.com>
##	Tim Chen <tim.c.chen@linux.intel.com>
##
## References:
##	This code was derived and highly optimized from the code described in the paper:
##	Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##	on Intel Architecture Processors. August, 2010
##	The details of the implementation are explained in:
##	Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##	on Intel Architecture Processors. October, 2012.
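##
## For orientation, a rough, non-normative sketch of what the init/update/
## finalize entry points below compute (field names refer to the
## gcm_context_data offsets defined further down in this file):
##
##	init:      CurCount = byteswap(iv)          (iv = salt||IV||0x1 = J0)
##	           AadHash  = GHASH(H, AAD)
##	           precompute HashKey^1..HashKey^8   (HashKey = H<<1 mod poly)
##	update:    for each 16-byte input block X:
##	               CurCount++
##	               output  = X xor AES_K(CurCount)
##	               AadHash = (AadHash xor C) * H mod poly
##	                         (C = ciphertext block: the output when
##	                          encrypting, X when decrypting)
##	finalize:  AadHash = (AadHash xor (len(AAD)||len(C), in bits)) * H mod poly
##	           tag     = AES_K(J0) xor byteswap(AadHash)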
##
## Assumptions:
##
##
##
## iv:
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                         Salt  (From the SA)                   |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                     Initialization Vector                     |
##       |        (This is the sequence number from IPSec header)        |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x1                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##
##
## AAD:
##       AAD padded to 128 bits with 0
##       for example, assume AAD is a u32 vector
##
##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
##       padded AAD in xmm register = {A1 A0 0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                           SPI (A1)                            |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                  32-bit Sequence Number (A0)                  |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##       AAD Format with 32-bit Sequence Number
##
##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
##       padded AAD in xmm register = {A2 A1 A0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                           SPI (A2)                            |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |              64-bit Extended Sequence Number {A1,A0}          |
##       |                                                               |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##       AAD Format with 64-bit Extended Sequence Number
##
##
## aadLen:
##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
##       The code additionally supports aadLen of length 16 bytes.
##
## TLen:
##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
##
## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentations are used. one tab is
## for the GHASH part, two tabs are for the AES part.
##

#include <linux/linkage.h>

# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:            .octa     0xC2000000000000000000000000000001

.section .rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
POLY2:           .octa     0xC20000000000000000000001C2000000

.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:          .octa     0x00000001000000000000000000000001

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F

.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:             .octa     0x00000000000000000000000000000001

.section .rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
ONEf:            .octa     0x01000000000000000000000000000000

# order of these constants should not change.
150# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F 151.section .rodata, "a", @progbits 152.align 16 153SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 154ALL_F: .octa 0xffffffffffffffffffffffffffffffff 155 .octa 0x00000000000000000000000000000000 156 157.section .rodata 158.align 16 159.type aad_shift_arr, @object 160.size aad_shift_arr, 272 161aad_shift_arr: 162 .octa 0xffffffffffffffffffffffffffffffff 163 .octa 0xffffffffffffffffffffffffffffff0C 164 .octa 0xffffffffffffffffffffffffffff0D0C 165 .octa 0xffffffffffffffffffffffffff0E0D0C 166 .octa 0xffffffffffffffffffffffff0F0E0D0C 167 .octa 0xffffffffffffffffffffff0C0B0A0908 168 .octa 0xffffffffffffffffffff0D0C0B0A0908 169 .octa 0xffffffffffffffffff0E0D0C0B0A0908 170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908 171 .octa 0xffffffffffffff0C0B0A090807060504 172 .octa 0xffffffffffff0D0C0B0A090807060504 173 .octa 0xffffffffff0E0D0C0B0A090807060504 174 .octa 0xffffffff0F0E0D0C0B0A090807060504 175 .octa 0xffffff0C0B0A09080706050403020100 176 .octa 0xffff0D0C0B0A09080706050403020100 177 .octa 0xff0E0D0C0B0A09080706050403020100 178 .octa 0x0F0E0D0C0B0A09080706050403020100 179 180 181.text 182 183 184#define AadHash 16*0 185#define AadLen 16*1 186#define InLen (16*1)+8 187#define PBlockEncKey 16*2 188#define OrigIV 16*3 189#define CurCount 16*4 190#define PBlockLen 16*5 191 192HashKey = 16*6 # store HashKey <<1 mod poly here 193HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here 194HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here 195HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here 196HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here 197HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here 198HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here 199HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here 200HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) 201HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) 202HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) 203HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) 204HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) 205HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) 206HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) 207HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) 208 209#define arg1 %rdi 210#define arg2 %rsi 211#define arg3 %rdx 212#define arg4 %rcx 213#define arg5 %r8 214#define arg6 %r9 215#define keysize 2*15*16(arg1) 216 217i = 0 218j = 0 219 220out_order = 0 221in_order = 1 222DEC = 0 223ENC = 1 224 225.macro define_reg r n 226reg_\r = %xmm\n 227.endm 228 229.macro setreg 230.altmacro 231define_reg i %i 232define_reg j %j 233.noaltmacro 234.endm 235 236TMP1 = 16*0 # Temporary storage for AAD 237TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) 238TMP3 = 16*2 # Temporary storage for AES State 3 239TMP4 = 16*3 # Temporary storage for AES State 4 240TMP5 = 16*4 # Temporary storage for AES State 5 241TMP6 = 16*5 # Temporary storage for AES State 6 242TMP7 = 16*6 # Temporary storage for AES State 7 243TMP8 = 16*7 # Temporary storage for AES State 8 244 245VARIABLE_OFFSET = 16*8 246 247################################ 248# Utility Macros 249################################ 250 251.macro FUNC_SAVE 252 push %r12 253 push 
%r13 254 push %r15 255 256 push %rbp 257 mov %rsp, %rbp 258 259 sub $VARIABLE_OFFSET, %rsp 260 and $~63, %rsp # align rsp to 64 bytes 261.endm 262 263.macro FUNC_RESTORE 264 mov %rbp, %rsp 265 pop %rbp 266 267 pop %r15 268 pop %r13 269 pop %r12 270.endm 271 272# Encryption of a single block 273.macro ENCRYPT_SINGLE_BLOCK REP XMM0 274 vpxor (arg1), \XMM0, \XMM0 275 i = 1 276 setreg 277.rep \REP 278 vaesenc 16*i(arg1), \XMM0, \XMM0 279 i = (i+1) 280 setreg 281.endr 282 vaesenclast 16*i(arg1), \XMM0, \XMM0 283.endm 284 285# combined for GCM encrypt and decrypt functions 286# clobbering all xmm registers 287# clobbering r10, r11, r12, r13, r15, rax 288.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP 289 vmovdqu AadHash(arg2), %xmm8 290 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey 291 add arg5, InLen(arg2) 292 293 # initialize the data pointer offset as zero 294 xor %r11d, %r11d 295 296 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC 297 sub %r11, arg5 298 299 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext 300 and $-16, %r13 # r13 = r13 - (r13 mod 16) 301 302 mov %r13, %r12 303 shr $4, %r12 304 and $7, %r12 305 jz _initial_num_blocks_is_0\@ 306 307 cmp $7, %r12 308 je _initial_num_blocks_is_7\@ 309 cmp $6, %r12 310 je _initial_num_blocks_is_6\@ 311 cmp $5, %r12 312 je _initial_num_blocks_is_5\@ 313 cmp $4, %r12 314 je _initial_num_blocks_is_4\@ 315 cmp $3, %r12 316 je _initial_num_blocks_is_3\@ 317 cmp $2, %r12 318 je _initial_num_blocks_is_2\@ 319 320 jmp _initial_num_blocks_is_1\@ 321 322_initial_num_blocks_is_7\@: 323 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 324 sub $16*7, %r13 325 jmp _initial_blocks_encrypted\@ 326 327_initial_num_blocks_is_6\@: 328 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 329 sub $16*6, %r13 330 jmp _initial_blocks_encrypted\@ 331 332_initial_num_blocks_is_5\@: 333 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 334 sub $16*5, %r13 335 jmp _initial_blocks_encrypted\@ 336 337_initial_num_blocks_is_4\@: 338 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 339 sub $16*4, %r13 340 jmp _initial_blocks_encrypted\@ 341 342_initial_num_blocks_is_3\@: 343 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 344 sub $16*3, %r13 345 jmp _initial_blocks_encrypted\@ 346 347_initial_num_blocks_is_2\@: 348 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 349 sub $16*2, %r13 350 jmp _initial_blocks_encrypted\@ 351 352_initial_num_blocks_is_1\@: 353 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 354 sub $16*1, %r13 355 jmp _initial_blocks_encrypted\@ 356 357_initial_num_blocks_is_0\@: 358 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 359 360 361_initial_blocks_encrypted\@: 362 
test %r13, %r13 363 je _zero_cipher_left\@ 364 365 sub $128, %r13 366 je _eight_cipher_left\@ 367 368 369 370 371 vmovd %xmm9, %r15d 372 and $255, %r15d 373 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 374 375 376_encrypt_by_8_new\@: 377 cmp $(255-8), %r15d 378 jg _encrypt_by_8\@ 379 380 381 382 add $8, %r15b 383 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 384 add $128, %r11 385 sub $128, %r13 386 jne _encrypt_by_8_new\@ 387 388 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 389 jmp _eight_cipher_left\@ 390 391_encrypt_by_8\@: 392 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 393 add $8, %r15b 394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 395 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 396 add $128, %r11 397 sub $128, %r13 398 jne _encrypt_by_8_new\@ 399 400 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 401 402 403 404 405_eight_cipher_left\@: 406 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 407 408 409_zero_cipher_left\@: 410 vmovdqu %xmm14, AadHash(arg2) 411 vmovdqu %xmm9, CurCount(arg2) 412 413 # check for 0 length 414 mov arg5, %r13 415 and $15, %r13 # r13 = (arg5 mod 16) 416 417 je _multiple_of_16_bytes\@ 418 419 # handle the last <16 Byte block separately 420 421 mov %r13, PBlockLen(arg2) 422 423 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 424 vmovdqu %xmm9, CurCount(arg2) 425 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 426 427 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) 428 vmovdqu %xmm9, PBlockEncKey(arg2) 429 430 cmp $16, arg5 431 jge _large_enough_update\@ 432 433 lea (arg4,%r11,1), %r10 434 mov %r13, %r12 435 436 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 437 438 lea SHIFT_MASK+16(%rip), %r12 439 sub %r13, %r12 # adjust the shuffle mask pointer to be 440 # able to shift 16-r13 bytes (r13 is the 441 # number of bytes in plaintext mod 16) 442 443 jmp _final_ghash_mul\@ 444 445_large_enough_update\@: 446 sub $16, %r11 447 add %r13, %r11 448 449 # receive the last <16 Byte block 450 vmovdqu (arg4, %r11, 1), %xmm1 451 452 sub %r13, %r11 453 add $16, %r11 454 455 lea SHIFT_MASK+16(%rip), %r12 456 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 457 # (r13 is the number of bytes in plaintext mod 16) 458 sub %r13, %r12 459 # get the appropriate shuffle mask 460 vmovdqu (%r12), %xmm2 461 # shift right 16-r13 bytes 462 vpshufb %xmm2, %xmm1, %xmm1 463 464_final_ghash_mul\@: 465 .if \ENC_DEC == DEC 466 vmovdqa %xmm1, %xmm2 467 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 468 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 469 # mask out top 16-r13 bytes of xmm9 470 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 471 vpand %xmm1, %xmm2, %xmm2 472 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 473 vpxor %xmm2, %xmm14, %xmm14 474 475 vmovdqu %xmm14, AadHash(arg2) 476 .else 477 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 479 # mask out top 16-r13 bytes of xmm9 480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 481 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 482 vpxor %xmm9, %xmm14, %xmm14 483 484 vmovdqu %xmm14, AadHash(arg2) 485 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 486 .endif 487 488 489 ############################# 490 # 
output r13 Bytes 491 vmovq %xmm9, %rax 492 cmp $8, %r13 493 jle _less_than_8_bytes_left\@ 494 495 mov %rax, (arg3 , %r11) 496 add $8, %r11 497 vpsrldq $8, %xmm9, %xmm9 498 vmovq %xmm9, %rax 499 sub $8, %r13 500 501_less_than_8_bytes_left\@: 502 movb %al, (arg3 , %r11) 503 add $1, %r11 504 shr $8, %rax 505 sub $1, %r13 506 jne _less_than_8_bytes_left\@ 507 ############################# 508 509_multiple_of_16_bytes\@: 510.endm 511 512 513# GCM_COMPLETE Finishes update of tag of last partial block 514# Output: Authorization Tag (AUTH_TAG) 515# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 516.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN 517 vmovdqu AadHash(arg2), %xmm14 518 vmovdqu HashKey(arg2), %xmm13 519 520 mov PBlockLen(arg2), %r12 521 test %r12, %r12 522 je _partial_done\@ 523 524 #GHASH computation for the last <16 Byte block 525 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 526 527_partial_done\@: 528 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) 529 shl $3, %r12 # convert into number of bits 530 vmovd %r12d, %xmm15 # len(A) in xmm15 531 532 mov InLen(arg2), %r12 533 shl $3, %r12 # len(C) in bits (*128) 534 vmovq %r12, %xmm1 535 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 536 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) 537 538 vpxor %xmm15, %xmm14, %xmm14 539 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 540 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 541 542 vmovdqu OrigIV(arg2), %xmm9 543 544 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) 545 546 vpxor %xmm14, %xmm9, %xmm9 547 548 549 550_return_T\@: 551 mov \AUTH_TAG, %r10 # r10 = authTag 552 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len 553 554 cmp $16, %r11 555 je _T_16\@ 556 557 cmp $8, %r11 558 jl _T_4\@ 559 560_T_8\@: 561 vmovq %xmm9, %rax 562 mov %rax, (%r10) 563 add $8, %r10 564 sub $8, %r11 565 vpsrldq $8, %xmm9, %xmm9 566 test %r11, %r11 567 je _return_T_done\@ 568_T_4\@: 569 vmovd %xmm9, %eax 570 mov %eax, (%r10) 571 add $4, %r10 572 sub $4, %r11 573 vpsrldq $4, %xmm9, %xmm9 574 test %r11, %r11 575 je _return_T_done\@ 576_T_123\@: 577 vmovd %xmm9, %eax 578 cmp $2, %r11 579 jl _T_1\@ 580 mov %ax, (%r10) 581 cmp $2, %r11 582 je _return_T_done\@ 583 add $2, %r10 584 sar $16, %eax 585_T_1\@: 586 mov %al, (%r10) 587 jmp _return_T_done\@ 588 589_T_16\@: 590 vmovdqu %xmm9, (%r10) 591 592_return_T_done\@: 593.endm 594 595.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 596 597 mov \AAD, %r10 # r10 = AAD 598 mov \AADLEN, %r12 # r12 = aadLen 599 600 601 mov %r12, %r11 602 603 vpxor \T8, \T8, \T8 604 vpxor \T7, \T7, \T7 605 cmp $16, %r11 606 jl _get_AAD_rest8\@ 607_get_AAD_blocks\@: 608 vmovdqu (%r10), \T7 609 vpshufb SHUF_MASK(%rip), \T7, \T7 610 vpxor \T7, \T8, \T8 611 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 612 add $16, %r10 613 sub $16, %r12 614 sub $16, %r11 615 cmp $16, %r11 616 jge _get_AAD_blocks\@ 617 vmovdqu \T8, \T7 618 test %r11, %r11 619 je _get_AAD_done\@ 620 621 vpxor \T7, \T7, \T7 622 623 /* read the last <16B of AAD. 
since we have at least 4B of 624 data right after the AAD (the ICV, and maybe some CT), we can 625 read 4B/8B blocks safely, and then get rid of the extra stuff */ 626_get_AAD_rest8\@: 627 cmp $4, %r11 628 jle _get_AAD_rest4\@ 629 movq (%r10), \T1 630 add $8, %r10 631 sub $8, %r11 632 vpslldq $8, \T1, \T1 633 vpsrldq $8, \T7, \T7 634 vpxor \T1, \T7, \T7 635 jmp _get_AAD_rest8\@ 636_get_AAD_rest4\@: 637 test %r11, %r11 638 jle _get_AAD_rest0\@ 639 mov (%r10), %eax 640 movq %rax, \T1 641 add $4, %r10 642 sub $4, %r11 643 vpslldq $12, \T1, \T1 644 vpsrldq $4, \T7, \T7 645 vpxor \T1, \T7, \T7 646_get_AAD_rest0\@: 647 /* finalize: shift out the extra bytes we read, and align 648 left. since pslldq can only shift by an immediate, we use 649 vpshufb and an array of shuffle masks */ 650 movq %r12, %r11 651 salq $4, %r11 652 vmovdqu aad_shift_arr(%r11), \T1 653 vpshufb \T1, \T7, \T7 654_get_AAD_rest_final\@: 655 vpshufb SHUF_MASK(%rip), \T7, \T7 656 vpxor \T8, \T7, \T7 657 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 658 659_get_AAD_done\@: 660 vmovdqu \T7, AadHash(arg2) 661.endm 662 663.macro INIT GHASH_MUL PRECOMPUTE 664 mov arg6, %r11 665 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length 666 xor %r11d, %r11d 667 mov %r11, InLen(arg2) # ctx_data.in_length = 0 668 669 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 670 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 671 mov arg3, %rax 672 movdqu (%rax), %xmm0 673 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv 674 675 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 676 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv 677 678 vmovdqu (arg4), %xmm6 # xmm6 = HashKey 679 680 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 681 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey 682 vmovdqa %xmm6, %xmm2 683 vpsllq $1, %xmm6, %xmm6 684 vpsrlq $63, %xmm2, %xmm2 685 vmovdqa %xmm2, %xmm1 686 vpslldq $8, %xmm2, %xmm2 687 vpsrldq $8, %xmm1, %xmm1 688 vpor %xmm2, %xmm6, %xmm6 689 #reduction 690 vpshufd $0b00100100, %xmm1, %xmm2 691 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 692 vpand POLY(%rip), %xmm2, %xmm2 693 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 694 ####################################################################### 695 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly 696 697 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 698 699 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 700.endm 701 702 703# Reads DLEN bytes starting at DPTR and stores in XMMDst 704# where 0 < DLEN < 16 705# Clobbers %rax, DLEN 706.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst 707 vpxor \XMMDst, \XMMDst, \XMMDst 708 709 cmp $8, \DLEN 710 jl _read_lt8_\@ 711 mov (\DPTR), %rax 712 vpinsrq $0, %rax, \XMMDst, \XMMDst 713 sub $8, \DLEN 714 jz _done_read_partial_block_\@ 715 xor %eax, %eax 716_read_next_byte_\@: 717 shl $8, %rax 718 mov 7(\DPTR, \DLEN, 1), %al 719 dec \DLEN 720 jnz _read_next_byte_\@ 721 vpinsrq $1, %rax, \XMMDst, \XMMDst 722 jmp _done_read_partial_block_\@ 723_read_lt8_\@: 724 xor %eax, %eax 725_read_next_byte_lt8_\@: 726 shl $8, %rax 727 mov -1(\DPTR, \DLEN, 1), %al 728 dec \DLEN 729 jnz _read_next_byte_lt8_\@ 730 vpinsrq $0, %rax, \XMMDst, \XMMDst 731_done_read_partial_block_\@: 732.endm 733 734# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks 735# between update calls. 
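# (Informal sketch only, using the context-field names defined earlier:
#  PBlockLen records how many bytes of the current 16-byte block a previous
#  update call already consumed, and PBlockEncKey holds E(K, Yn) for that
#  block. In C-like pseudocode the carry-over handled by this macro is:
#      if (PBlockLen) {
#          n = min(16 - PBlockLen, input_len);
#          /* XOR the next n input bytes against the unused keystream    */
#          /* bytes of PBlockEncKey, fold the resulting ciphertext bytes */
#          /* into AadHash, then either GHASH the completed block and    */
#          /* clear PBlockLen, or advance PBlockLen by n.                */
#      }
#  The exact masking/shuffling is done by the code below.)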
736# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 737# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 738# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 739.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 740 AAD_HASH ENC_DEC 741 mov PBlockLen(arg2), %r13 742 test %r13, %r13 743 je _partial_block_done_\@ # Leave Macro if no partial blocks 744 # Read in input data without over reading 745 cmp $16, \PLAIN_CYPH_LEN 746 jl _fewer_than_16_bytes_\@ 747 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 748 jmp _data_read_\@ 749 750_fewer_than_16_bytes_\@: 751 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 752 mov \PLAIN_CYPH_LEN, %r12 753 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 754 755 mov PBlockLen(arg2), %r13 756 757_data_read_\@: # Finished reading in data 758 759 vmovdqu PBlockEncKey(arg2), %xmm9 760 vmovdqu HashKey(arg2), %xmm13 761 762 lea SHIFT_MASK(%rip), %r12 763 764 # adjust the shuffle mask pointer to be able to shift r13 bytes 765 # r16-r13 is the number of bytes in plaintext mod 16) 766 add %r13, %r12 767 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 768 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes 769 770.if \ENC_DEC == DEC 771 vmovdqa %xmm1, %xmm3 772 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) 773 774 mov \PLAIN_CYPH_LEN, %r10 775 add %r13, %r10 776 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 777 sub $16, %r10 778 # Determine if if partial block is not being filled and 779 # shift mask accordingly 780 jge _no_extra_mask_1_\@ 781 sub %r10, %r12 782_no_extra_mask_1_\@: 783 784 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 785 # get the appropriate mask to mask out bottom r13 bytes of xmm9 786 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 787 788 vpand %xmm1, %xmm3, %xmm3 789 vmovdqa SHUF_MASK(%rip), %xmm10 790 vpshufb %xmm10, %xmm3, %xmm3 791 vpshufb %xmm2, %xmm3, %xmm3 792 vpxor %xmm3, \AAD_HASH, \AAD_HASH 793 794 test %r10, %r10 795 jl _partial_incomplete_1_\@ 796 797 # GHASH computation for the last <16 Byte block 798 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 799 xor %eax,%eax 800 801 mov %rax, PBlockLen(arg2) 802 jmp _dec_done_\@ 803_partial_incomplete_1_\@: 804 add \PLAIN_CYPH_LEN, PBlockLen(arg2) 805_dec_done_\@: 806 vmovdqu \AAD_HASH, AadHash(arg2) 807.else 808 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 809 810 mov \PLAIN_CYPH_LEN, %r10 811 add %r13, %r10 812 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 813 sub $16, %r10 814 # Determine if if partial block is not being filled and 815 # shift mask accordingly 816 jge _no_extra_mask_2_\@ 817 sub %r10, %r12 818_no_extra_mask_2_\@: 819 820 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 821 # get the appropriate mask to mask out bottom r13 bytes of xmm9 822 vpand %xmm1, %xmm9, %xmm9 823 824 vmovdqa SHUF_MASK(%rip), %xmm1 825 vpshufb %xmm1, %xmm9, %xmm9 826 vpshufb %xmm2, %xmm9, %xmm9 827 vpxor %xmm9, \AAD_HASH, \AAD_HASH 828 829 test %r10, %r10 830 jl _partial_incomplete_2_\@ 831 832 # GHASH computation for the last <16 Byte block 833 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 834 xor %eax,%eax 835 836 mov %rax, PBlockLen(arg2) 837 jmp _encode_done_\@ 838_partial_incomplete_2_\@: 839 add \PLAIN_CYPH_LEN, PBlockLen(arg2) 840_encode_done_\@: 841 vmovdqu \AAD_HASH, AadHash(arg2) 842 843 vmovdqa SHUF_MASK(%rip), %xmm10 844 # shuffle xmm9 back to output as ciphertext 845 vpshufb %xmm10, %xmm9, %xmm9 
846 vpshufb %xmm2, %xmm9, %xmm9 847.endif 848 # output encrypted Bytes 849 test %r10, %r10 850 jl _partial_fill_\@ 851 mov %r13, %r12 852 mov $16, %r13 853 # Set r13 to be the number of bytes to write out 854 sub %r12, %r13 855 jmp _count_set_\@ 856_partial_fill_\@: 857 mov \PLAIN_CYPH_LEN, %r13 858_count_set_\@: 859 vmovdqa %xmm9, %xmm0 860 vmovq %xmm0, %rax 861 cmp $8, %r13 862 jle _less_than_8_bytes_left_\@ 863 864 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 865 add $8, \DATA_OFFSET 866 psrldq $8, %xmm0 867 vmovq %xmm0, %rax 868 sub $8, %r13 869_less_than_8_bytes_left_\@: 870 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 871 add $1, \DATA_OFFSET 872 shr $8, %rax 873 sub $1, %r13 874 jne _less_than_8_bytes_left_\@ 875_partial_block_done_\@: 876.endm # PARTIAL_BLOCK 877 878############################################################################### 879# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 880# Input: A and B (128-bits each, bit-reflected) 881# Output: C = A*B*x mod poly, (i.e. >>1 ) 882# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 883# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 884############################################################################### 885.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 886 887 vpshufd $0b01001110, \GH, \T2 888 vpshufd $0b01001110, \HK, \T3 889 vpxor \GH , \T2, \T2 # T2 = (a1+a0) 890 vpxor \HK , \T3, \T3 # T3 = (b1+b0) 891 892 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 893 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 894 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) 895 vpxor \GH, \T2,\T2 896 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 897 898 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs 899 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs 900 vpxor \T3, \GH, \GH 901 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK 902 903 #first phase of the reduction 904 vpslld $31, \GH, \T2 # packed right shifting << 31 905 vpslld $30, \GH, \T3 # packed right shifting shift << 30 906 vpslld $25, \GH, \T4 # packed right shifting shift << 25 907 908 vpxor \T3, \T2, \T2 # xor the shifted versions 909 vpxor \T4, \T2, \T2 910 911 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW 912 913 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 914 vpxor \T2, \GH, \GH # first phase of the reduction complete 915 916 #second phase of the reduction 917 918 vpsrld $1,\GH, \T2 # packed left shifting >> 1 919 vpsrld $2,\GH, \T3 # packed left shifting >> 2 920 vpsrld $7,\GH, \T4 # packed left shifting >> 7 921 vpxor \T3, \T2, \T2 # xor the shifted versions 922 vpxor \T4, \T2, \T2 923 924 vpxor \T5, \T2, \T2 925 vpxor \T2, \GH, \GH 926 vpxor \T1, \GH, \GH # the result is in GH 927 928 929.endm 930 931.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 932 933 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 934 vmovdqa \HK, \T5 935 936 vpshufd $0b01001110, \T5, \T1 937 vpxor \T5, \T1, \T1 938 vmovdqu \T1, HashKey_k(arg2) 939 940 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 941 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly 942 vpshufd $0b01001110, \T5, \T1 943 vpxor \T5, \T1, \T1 944 vmovdqu \T1, HashKey_2_k(arg2) 945 946 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 947 vmovdqu \T5, HashKey_3(arg2) 948 vpshufd $0b01001110, \T5, \T1 949 vpxor \T5, \T1, \T1 950 vmovdqu \T1, HashKey_3_k(arg2) 951 952 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 953 vmovdqu \T5, HashKey_4(arg2) 954 vpshufd $0b01001110, \T5, \T1 
955 vpxor \T5, \T1, \T1 956 vmovdqu \T1, HashKey_4_k(arg2) 957 958 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 959 vmovdqu \T5, HashKey_5(arg2) 960 vpshufd $0b01001110, \T5, \T1 961 vpxor \T5, \T1, \T1 962 vmovdqu \T1, HashKey_5_k(arg2) 963 964 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 965 vmovdqu \T5, HashKey_6(arg2) 966 vpshufd $0b01001110, \T5, \T1 967 vpxor \T5, \T1, \T1 968 vmovdqu \T1, HashKey_6_k(arg2) 969 970 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 971 vmovdqu \T5, HashKey_7(arg2) 972 vpshufd $0b01001110, \T5, \T1 973 vpxor \T5, \T1, \T1 974 vmovdqu \T1, HashKey_7_k(arg2) 975 976 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 977 vmovdqu \T5, HashKey_8(arg2) 978 vpshufd $0b01001110, \T5, \T1 979 vpxor \T5, \T1, \T1 980 vmovdqu \T1, HashKey_8_k(arg2) 981 982.endm 983 984## if a = number of total plaintext bytes 985## b = floor(a/16) 986## num_initial_blocks = b mod 4# 987## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 988## r10, r11, r12, rax are clobbered 989## arg1, arg2, arg3, arg4 are used as pointers only, not modified 990 991.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC 992 i = (8-\num_initial_blocks) 993 setreg 994 vmovdqu AadHash(arg2), reg_i 995 996 # start AES for num_initial_blocks blocks 997 vmovdqu CurCount(arg2), \CTR 998 999 i = (9-\num_initial_blocks) 1000 setreg 1001.rep \num_initial_blocks 1002 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1003 vmovdqa \CTR, reg_i 1004 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1005 i = (i+1) 1006 setreg 1007.endr 1008 1009 vmovdqa (arg1), \T_key 1010 i = (9-\num_initial_blocks) 1011 setreg 1012.rep \num_initial_blocks 1013 vpxor \T_key, reg_i, reg_i 1014 i = (i+1) 1015 setreg 1016.endr 1017 1018 j = 1 1019 setreg 1020.rep \REP 1021 vmovdqa 16*j(arg1), \T_key 1022 i = (9-\num_initial_blocks) 1023 setreg 1024.rep \num_initial_blocks 1025 vaesenc \T_key, reg_i, reg_i 1026 i = (i+1) 1027 setreg 1028.endr 1029 1030 j = (j+1) 1031 setreg 1032.endr 1033 1034 vmovdqa 16*j(arg1), \T_key 1035 i = (9-\num_initial_blocks) 1036 setreg 1037.rep \num_initial_blocks 1038 vaesenclast \T_key, reg_i, reg_i 1039 i = (i+1) 1040 setreg 1041.endr 1042 1043 i = (9-\num_initial_blocks) 1044 setreg 1045.rep \num_initial_blocks 1046 vmovdqu (arg4, %r11), \T1 1047 vpxor \T1, reg_i, reg_i 1048 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks 1049 add $16, %r11 1050.if \ENC_DEC == DEC 1051 vmovdqa \T1, reg_i 1052.endif 1053 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1054 i = (i+1) 1055 setreg 1056.endr 1057 1058 1059 i = (8-\num_initial_blocks) 1060 j = (9-\num_initial_blocks) 1061 setreg 1062 1063.rep \num_initial_blocks 1064 vpxor reg_i, reg_j, reg_j 1065 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1066 i = (i+1) 1067 j = (j+1) 1068 setreg 1069.endr 1070 # XMM8 has the combined result here 1071 1072 vmovdqa \XMM8, TMP1(%rsp) 1073 vmovdqa \XMM8, \T3 1074 1075 cmp $128, %r13 1076 jl _initial_blocks_done\@ # no need for precomputed constants 1077 1078############################################################################### 1079# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1080 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1081 vmovdqa \CTR, \XMM1 1082 vpshufb 
SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1083 1084 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1085 vmovdqa \CTR, \XMM2 1086 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1087 1088 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1089 vmovdqa \CTR, \XMM3 1090 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1091 1092 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1093 vmovdqa \CTR, \XMM4 1094 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1095 1096 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1097 vmovdqa \CTR, \XMM5 1098 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1099 1100 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1101 vmovdqa \CTR, \XMM6 1102 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1103 1104 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1105 vmovdqa \CTR, \XMM7 1106 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1107 1108 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1109 vmovdqa \CTR, \XMM8 1110 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1111 1112 vmovdqa (arg1), \T_key 1113 vpxor \T_key, \XMM1, \XMM1 1114 vpxor \T_key, \XMM2, \XMM2 1115 vpxor \T_key, \XMM3, \XMM3 1116 vpxor \T_key, \XMM4, \XMM4 1117 vpxor \T_key, \XMM5, \XMM5 1118 vpxor \T_key, \XMM6, \XMM6 1119 vpxor \T_key, \XMM7, \XMM7 1120 vpxor \T_key, \XMM8, \XMM8 1121 1122 i = 1 1123 setreg 1124.rep \REP # do REP rounds 1125 vmovdqa 16*i(arg1), \T_key 1126 vaesenc \T_key, \XMM1, \XMM1 1127 vaesenc \T_key, \XMM2, \XMM2 1128 vaesenc \T_key, \XMM3, \XMM3 1129 vaesenc \T_key, \XMM4, \XMM4 1130 vaesenc \T_key, \XMM5, \XMM5 1131 vaesenc \T_key, \XMM6, \XMM6 1132 vaesenc \T_key, \XMM7, \XMM7 1133 vaesenc \T_key, \XMM8, \XMM8 1134 i = (i+1) 1135 setreg 1136.endr 1137 1138 vmovdqa 16*i(arg1), \T_key 1139 vaesenclast \T_key, \XMM1, \XMM1 1140 vaesenclast \T_key, \XMM2, \XMM2 1141 vaesenclast \T_key, \XMM3, \XMM3 1142 vaesenclast \T_key, \XMM4, \XMM4 1143 vaesenclast \T_key, \XMM5, \XMM5 1144 vaesenclast \T_key, \XMM6, \XMM6 1145 vaesenclast \T_key, \XMM7, \XMM7 1146 vaesenclast \T_key, \XMM8, \XMM8 1147 1148 vmovdqu (arg4, %r11), \T1 1149 vpxor \T1, \XMM1, \XMM1 1150 vmovdqu \XMM1, (arg3 , %r11) 1151 .if \ENC_DEC == DEC 1152 vmovdqa \T1, \XMM1 1153 .endif 1154 1155 vmovdqu 16*1(arg4, %r11), \T1 1156 vpxor \T1, \XMM2, \XMM2 1157 vmovdqu \XMM2, 16*1(arg3 , %r11) 1158 .if \ENC_DEC == DEC 1159 vmovdqa \T1, \XMM2 1160 .endif 1161 1162 vmovdqu 16*2(arg4, %r11), \T1 1163 vpxor \T1, \XMM3, \XMM3 1164 vmovdqu \XMM3, 16*2(arg3 , %r11) 1165 .if \ENC_DEC == DEC 1166 vmovdqa \T1, \XMM3 1167 .endif 1168 1169 vmovdqu 16*3(arg4, %r11), \T1 1170 vpxor \T1, \XMM4, \XMM4 1171 vmovdqu \XMM4, 16*3(arg3 , %r11) 1172 .if \ENC_DEC == DEC 1173 vmovdqa \T1, \XMM4 1174 .endif 1175 1176 vmovdqu 16*4(arg4, %r11), \T1 1177 vpxor \T1, \XMM5, \XMM5 1178 vmovdqu \XMM5, 16*4(arg3 , %r11) 1179 .if \ENC_DEC == DEC 1180 vmovdqa \T1, \XMM5 1181 .endif 1182 1183 vmovdqu 16*5(arg4, %r11), \T1 1184 vpxor \T1, \XMM6, \XMM6 1185 vmovdqu \XMM6, 16*5(arg3 , %r11) 1186 .if \ENC_DEC == DEC 1187 vmovdqa \T1, \XMM6 1188 .endif 1189 1190 vmovdqu 16*6(arg4, %r11), \T1 1191 vpxor \T1, \XMM7, \XMM7 1192 vmovdqu \XMM7, 16*6(arg3 , %r11) 1193 .if \ENC_DEC == DEC 1194 vmovdqa \T1, \XMM7 1195 .endif 1196 1197 vmovdqu 16*7(arg4, %r11), \T1 1198 vpxor \T1, \XMM8, \XMM8 1199 vmovdqu \XMM8, 16*7(arg3 , %r11) 1200 .if \ENC_DEC == DEC 1201 vmovdqa \T1, \XMM8 1202 .endif 1203 1204 add $128, %r11 1205 1206 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1207 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine 
GHASHed value with the corresponding ciphertext 1208 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1209 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1210 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1211 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1212 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1213 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1214 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1215 1216############################################################################### 1217 1218_initial_blocks_done\@: 1219 1220.endm 1221 1222# encrypt 8 blocks at a time 1223# ghash the 8 previously encrypted ciphertext blocks 1224# arg1, arg2, arg3, arg4 are used as pointers only, not modified 1225# r11 is the data offset value 1226.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 1227 1228 vmovdqa \XMM1, \T2 1229 vmovdqa \XMM2, TMP2(%rsp) 1230 vmovdqa \XMM3, TMP3(%rsp) 1231 vmovdqa \XMM4, TMP4(%rsp) 1232 vmovdqa \XMM5, TMP5(%rsp) 1233 vmovdqa \XMM6, TMP6(%rsp) 1234 vmovdqa \XMM7, TMP7(%rsp) 1235 vmovdqa \XMM8, TMP8(%rsp) 1236 1237.if \loop_idx == in_order 1238 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 1239 vpaddd ONE(%rip), \XMM1, \XMM2 1240 vpaddd ONE(%rip), \XMM2, \XMM3 1241 vpaddd ONE(%rip), \XMM3, \XMM4 1242 vpaddd ONE(%rip), \XMM4, \XMM5 1243 vpaddd ONE(%rip), \XMM5, \XMM6 1244 vpaddd ONE(%rip), \XMM6, \XMM7 1245 vpaddd ONE(%rip), \XMM7, \XMM8 1246 vmovdqa \XMM8, \CTR 1247 1248 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1249 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1250 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1251 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1252 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1253 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1254 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1255 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1256.else 1257 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 1258 vpaddd ONEf(%rip), \XMM1, \XMM2 1259 vpaddd ONEf(%rip), \XMM2, \XMM3 1260 vpaddd ONEf(%rip), \XMM3, \XMM4 1261 vpaddd ONEf(%rip), \XMM4, \XMM5 1262 vpaddd ONEf(%rip), \XMM5, \XMM6 1263 vpaddd ONEf(%rip), \XMM6, \XMM7 1264 vpaddd ONEf(%rip), \XMM7, \XMM8 1265 vmovdqa \XMM8, \CTR 1266.endif 1267 1268 1269 ####################################################################### 1270 1271 vmovdqu (arg1), \T1 1272 vpxor \T1, \XMM1, \XMM1 1273 vpxor \T1, \XMM2, \XMM2 1274 vpxor \T1, \XMM3, \XMM3 1275 vpxor \T1, \XMM4, \XMM4 1276 vpxor \T1, \XMM5, \XMM5 1277 vpxor \T1, \XMM6, \XMM6 1278 vpxor \T1, \XMM7, \XMM7 1279 vpxor \T1, \XMM8, \XMM8 1280 1281 ####################################################################### 1282 1283 1284 1285 1286 1287 vmovdqu 16*1(arg1), \T1 1288 vaesenc \T1, \XMM1, \XMM1 1289 vaesenc \T1, \XMM2, \XMM2 1290 vaesenc \T1, \XMM3, \XMM3 1291 vaesenc \T1, \XMM4, \XMM4 1292 vaesenc \T1, \XMM5, \XMM5 1293 vaesenc \T1, \XMM6, \XMM6 1294 vaesenc \T1, \XMM7, \XMM7 1295 vaesenc \T1, \XMM8, \XMM8 1296 1297 vmovdqu 16*2(arg1), \T1 1298 vaesenc \T1, \XMM1, \XMM1 1299 vaesenc \T1, \XMM2, \XMM2 1300 vaesenc \T1, \XMM3, \XMM3 1301 vaesenc \T1, \XMM4, \XMM4 1302 vaesenc \T1, \XMM5, \XMM5 1303 vaesenc \T1, \XMM6, \XMM6 1304 vaesenc \T1, \XMM7, \XMM7 1305 vaesenc \T1, \XMM8, \XMM8 1306 1307 1308 
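	# Note: from this point on, the remaining AES rounds are interleaved
	# with the Karatsuba partial products of the eight saved ciphertext
	# blocks against HashKey_8 .. HashKey_1; the high, low and middle
	# products are accumulated in T4, T7 and T6 respectively.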
####################################################################### 1309 1310 vmovdqu HashKey_8(arg2), \T5 1311 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 1312 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 1313 1314 vpshufd $0b01001110, \T2, \T6 1315 vpxor \T2, \T6, \T6 1316 1317 vmovdqu HashKey_8_k(arg2), \T5 1318 vpclmulqdq $0x00, \T5, \T6, \T6 1319 1320 vmovdqu 16*3(arg1), \T1 1321 vaesenc \T1, \XMM1, \XMM1 1322 vaesenc \T1, \XMM2, \XMM2 1323 vaesenc \T1, \XMM3, \XMM3 1324 vaesenc \T1, \XMM4, \XMM4 1325 vaesenc \T1, \XMM5, \XMM5 1326 vaesenc \T1, \XMM6, \XMM6 1327 vaesenc \T1, \XMM7, \XMM7 1328 vaesenc \T1, \XMM8, \XMM8 1329 1330 vmovdqa TMP2(%rsp), \T1 1331 vmovdqu HashKey_7(arg2), \T5 1332 vpclmulqdq $0x11, \T5, \T1, \T3 1333 vpxor \T3, \T4, \T4 1334 vpclmulqdq $0x00, \T5, \T1, \T3 1335 vpxor \T3, \T7, \T7 1336 1337 vpshufd $0b01001110, \T1, \T3 1338 vpxor \T1, \T3, \T3 1339 vmovdqu HashKey_7_k(arg2), \T5 1340 vpclmulqdq $0x10, \T5, \T3, \T3 1341 vpxor \T3, \T6, \T6 1342 1343 vmovdqu 16*4(arg1), \T1 1344 vaesenc \T1, \XMM1, \XMM1 1345 vaesenc \T1, \XMM2, \XMM2 1346 vaesenc \T1, \XMM3, \XMM3 1347 vaesenc \T1, \XMM4, \XMM4 1348 vaesenc \T1, \XMM5, \XMM5 1349 vaesenc \T1, \XMM6, \XMM6 1350 vaesenc \T1, \XMM7, \XMM7 1351 vaesenc \T1, \XMM8, \XMM8 1352 1353 ####################################################################### 1354 1355 vmovdqa TMP3(%rsp), \T1 1356 vmovdqu HashKey_6(arg2), \T5 1357 vpclmulqdq $0x11, \T5, \T1, \T3 1358 vpxor \T3, \T4, \T4 1359 vpclmulqdq $0x00, \T5, \T1, \T3 1360 vpxor \T3, \T7, \T7 1361 1362 vpshufd $0b01001110, \T1, \T3 1363 vpxor \T1, \T3, \T3 1364 vmovdqu HashKey_6_k(arg2), \T5 1365 vpclmulqdq $0x10, \T5, \T3, \T3 1366 vpxor \T3, \T6, \T6 1367 1368 vmovdqu 16*5(arg1), \T1 1369 vaesenc \T1, \XMM1, \XMM1 1370 vaesenc \T1, \XMM2, \XMM2 1371 vaesenc \T1, \XMM3, \XMM3 1372 vaesenc \T1, \XMM4, \XMM4 1373 vaesenc \T1, \XMM5, \XMM5 1374 vaesenc \T1, \XMM6, \XMM6 1375 vaesenc \T1, \XMM7, \XMM7 1376 vaesenc \T1, \XMM8, \XMM8 1377 1378 vmovdqa TMP4(%rsp), \T1 1379 vmovdqu HashKey_5(arg2), \T5 1380 vpclmulqdq $0x11, \T5, \T1, \T3 1381 vpxor \T3, \T4, \T4 1382 vpclmulqdq $0x00, \T5, \T1, \T3 1383 vpxor \T3, \T7, \T7 1384 1385 vpshufd $0b01001110, \T1, \T3 1386 vpxor \T1, \T3, \T3 1387 vmovdqu HashKey_5_k(arg2), \T5 1388 vpclmulqdq $0x10, \T5, \T3, \T3 1389 vpxor \T3, \T6, \T6 1390 1391 vmovdqu 16*6(arg1), \T1 1392 vaesenc \T1, \XMM1, \XMM1 1393 vaesenc \T1, \XMM2, \XMM2 1394 vaesenc \T1, \XMM3, \XMM3 1395 vaesenc \T1, \XMM4, \XMM4 1396 vaesenc \T1, \XMM5, \XMM5 1397 vaesenc \T1, \XMM6, \XMM6 1398 vaesenc \T1, \XMM7, \XMM7 1399 vaesenc \T1, \XMM8, \XMM8 1400 1401 1402 vmovdqa TMP5(%rsp), \T1 1403 vmovdqu HashKey_4(arg2), \T5 1404 vpclmulqdq $0x11, \T5, \T1, \T3 1405 vpxor \T3, \T4, \T4 1406 vpclmulqdq $0x00, \T5, \T1, \T3 1407 vpxor \T3, \T7, \T7 1408 1409 vpshufd $0b01001110, \T1, \T3 1410 vpxor \T1, \T3, \T3 1411 vmovdqu HashKey_4_k(arg2), \T5 1412 vpclmulqdq $0x10, \T5, \T3, \T3 1413 vpxor \T3, \T6, \T6 1414 1415 vmovdqu 16*7(arg1), \T1 1416 vaesenc \T1, \XMM1, \XMM1 1417 vaesenc \T1, \XMM2, \XMM2 1418 vaesenc \T1, \XMM3, \XMM3 1419 vaesenc \T1, \XMM4, \XMM4 1420 vaesenc \T1, \XMM5, \XMM5 1421 vaesenc \T1, \XMM6, \XMM6 1422 vaesenc \T1, \XMM7, \XMM7 1423 vaesenc \T1, \XMM8, \XMM8 1424 1425 vmovdqa TMP6(%rsp), \T1 1426 vmovdqu HashKey_3(arg2), \T5 1427 vpclmulqdq $0x11, \T5, \T1, \T3 1428 vpxor \T3, \T4, \T4 1429 vpclmulqdq $0x00, \T5, \T1, \T3 1430 vpxor \T3, \T7, \T7 1431 1432 vpshufd $0b01001110, \T1, \T3 1433 vpxor \T1, \T3, \T3 1434 vmovdqu 
HashKey_3_k(arg2), \T5 1435 vpclmulqdq $0x10, \T5, \T3, \T3 1436 vpxor \T3, \T6, \T6 1437 1438 1439 vmovdqu 16*8(arg1), \T1 1440 vaesenc \T1, \XMM1, \XMM1 1441 vaesenc \T1, \XMM2, \XMM2 1442 vaesenc \T1, \XMM3, \XMM3 1443 vaesenc \T1, \XMM4, \XMM4 1444 vaesenc \T1, \XMM5, \XMM5 1445 vaesenc \T1, \XMM6, \XMM6 1446 vaesenc \T1, \XMM7, \XMM7 1447 vaesenc \T1, \XMM8, \XMM8 1448 1449 vmovdqa TMP7(%rsp), \T1 1450 vmovdqu HashKey_2(arg2), \T5 1451 vpclmulqdq $0x11, \T5, \T1, \T3 1452 vpxor \T3, \T4, \T4 1453 vpclmulqdq $0x00, \T5, \T1, \T3 1454 vpxor \T3, \T7, \T7 1455 1456 vpshufd $0b01001110, \T1, \T3 1457 vpxor \T1, \T3, \T3 1458 vmovdqu HashKey_2_k(arg2), \T5 1459 vpclmulqdq $0x10, \T5, \T3, \T3 1460 vpxor \T3, \T6, \T6 1461 1462 ####################################################################### 1463 1464 vmovdqu 16*9(arg1), \T5 1465 vaesenc \T5, \XMM1, \XMM1 1466 vaesenc \T5, \XMM2, \XMM2 1467 vaesenc \T5, \XMM3, \XMM3 1468 vaesenc \T5, \XMM4, \XMM4 1469 vaesenc \T5, \XMM5, \XMM5 1470 vaesenc \T5, \XMM6, \XMM6 1471 vaesenc \T5, \XMM7, \XMM7 1472 vaesenc \T5, \XMM8, \XMM8 1473 1474 vmovdqa TMP8(%rsp), \T1 1475 vmovdqu HashKey(arg2), \T5 1476 vpclmulqdq $0x11, \T5, \T1, \T3 1477 vpxor \T3, \T4, \T4 1478 vpclmulqdq $0x00, \T5, \T1, \T3 1479 vpxor \T3, \T7, \T7 1480 1481 vpshufd $0b01001110, \T1, \T3 1482 vpxor \T1, \T3, \T3 1483 vmovdqu HashKey_k(arg2), \T5 1484 vpclmulqdq $0x10, \T5, \T3, \T3 1485 vpxor \T3, \T6, \T6 1486 1487 vpxor \T4, \T6, \T6 1488 vpxor \T7, \T6, \T6 1489 1490 vmovdqu 16*10(arg1), \T5 1491 1492 i = 11 1493 setreg 1494.rep (\REP-9) 1495 1496 vaesenc \T5, \XMM1, \XMM1 1497 vaesenc \T5, \XMM2, \XMM2 1498 vaesenc \T5, \XMM3, \XMM3 1499 vaesenc \T5, \XMM4, \XMM4 1500 vaesenc \T5, \XMM5, \XMM5 1501 vaesenc \T5, \XMM6, \XMM6 1502 vaesenc \T5, \XMM7, \XMM7 1503 vaesenc \T5, \XMM8, \XMM8 1504 1505 vmovdqu 16*i(arg1), \T5 1506 i = i + 1 1507 setreg 1508.endr 1509 1510 i = 0 1511 j = 1 1512 setreg 1513.rep 8 1514 vpxor 16*i(arg4, %r11), \T5, \T2 1515 .if \ENC_DEC == ENC 1516 vaesenclast \T2, reg_j, reg_j 1517 .else 1518 vaesenclast \T2, reg_j, \T3 1519 vmovdqu 16*i(arg4, %r11), reg_j 1520 vmovdqu \T3, 16*i(arg3, %r11) 1521 .endif 1522 i = (i+1) 1523 j = (j+1) 1524 setreg 1525.endr 1526 ####################################################################### 1527 1528 1529 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 1530 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 1531 vpxor \T3, \T7, \T7 1532 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 1533 1534 1535 1536 ####################################################################### 1537 #first phase of the reduction 1538 ####################################################################### 1539 vpslld $31, \T7, \T2 # packed right shifting << 31 1540 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1541 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1542 1543 vpxor \T3, \T2, \T2 # xor the shifted versions 1544 vpxor \T4, \T2, \T2 1545 1546 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1547 1548 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1549 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1550 ####################################################################### 1551 .if \ENC_DEC == ENC 1552 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer 1553 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer 1554 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer 1555 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer 1556 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the 
Ciphertext buffer 1557 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer 1558 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer 1559 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer 1560 .endif 1561 1562 ####################################################################### 1563 #second phase of the reduction 1564 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1565 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1566 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1567 vpxor \T3, \T2, \T2 # xor the shifted versions 1568 vpxor \T4, \T2, \T2 1569 1570 vpxor \T1, \T2, \T2 1571 vpxor \T2, \T7, \T7 1572 vpxor \T7, \T6, \T6 # the result is in T6 1573 ####################################################################### 1574 1575 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1576 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1577 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1578 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1579 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1580 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1581 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1582 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1583 1584 1585 vpxor \T6, \XMM1, \XMM1 1586 1587 1588 1589.endm 1590 1591 1592# GHASH the last 4 ciphertext blocks. 1593.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 1594 1595 ## Karatsuba Method 1596 1597 1598 vpshufd $0b01001110, \XMM1, \T2 1599 vpxor \XMM1, \T2, \T2 1600 vmovdqu HashKey_8(arg2), \T5 1601 vpclmulqdq $0x11, \T5, \XMM1, \T6 1602 vpclmulqdq $0x00, \T5, \XMM1, \T7 1603 1604 vmovdqu HashKey_8_k(arg2), \T3 1605 vpclmulqdq $0x00, \T3, \T2, \XMM1 1606 1607 ###################### 1608 1609 vpshufd $0b01001110, \XMM2, \T2 1610 vpxor \XMM2, \T2, \T2 1611 vmovdqu HashKey_7(arg2), \T5 1612 vpclmulqdq $0x11, \T5, \XMM2, \T4 1613 vpxor \T4, \T6, \T6 1614 1615 vpclmulqdq $0x00, \T5, \XMM2, \T4 1616 vpxor \T4, \T7, \T7 1617 1618 vmovdqu HashKey_7_k(arg2), \T3 1619 vpclmulqdq $0x00, \T3, \T2, \T2 1620 vpxor \T2, \XMM1, \XMM1 1621 1622 ###################### 1623 1624 vpshufd $0b01001110, \XMM3, \T2 1625 vpxor \XMM3, \T2, \T2 1626 vmovdqu HashKey_6(arg2), \T5 1627 vpclmulqdq $0x11, \T5, \XMM3, \T4 1628 vpxor \T4, \T6, \T6 1629 1630 vpclmulqdq $0x00, \T5, \XMM3, \T4 1631 vpxor \T4, \T7, \T7 1632 1633 vmovdqu HashKey_6_k(arg2), \T3 1634 vpclmulqdq $0x00, \T3, \T2, \T2 1635 vpxor \T2, \XMM1, \XMM1 1636 1637 ###################### 1638 1639 vpshufd $0b01001110, \XMM4, \T2 1640 vpxor \XMM4, \T2, \T2 1641 vmovdqu HashKey_5(arg2), \T5 1642 vpclmulqdq $0x11, \T5, \XMM4, \T4 1643 vpxor \T4, \T6, \T6 1644 1645 vpclmulqdq $0x00, \T5, \XMM4, \T4 1646 vpxor \T4, \T7, \T7 1647 1648 vmovdqu HashKey_5_k(arg2), \T3 1649 vpclmulqdq $0x00, \T3, \T2, \T2 1650 vpxor \T2, \XMM1, \XMM1 1651 1652 ###################### 1653 1654 vpshufd $0b01001110, \XMM5, \T2 1655 vpxor \XMM5, \T2, \T2 1656 vmovdqu HashKey_4(arg2), \T5 1657 vpclmulqdq $0x11, \T5, \XMM5, \T4 1658 vpxor \T4, \T6, \T6 1659 1660 vpclmulqdq $0x00, \T5, \XMM5, \T4 1661 vpxor \T4, \T7, \T7 1662 1663 vmovdqu HashKey_4_k(arg2), \T3 1664 vpclmulqdq $0x00, \T3, \T2, \T2 1665 vpxor \T2, \XMM1, \XMM1 1666 1667 ###################### 1668 1669 vpshufd $0b01001110, \XMM6, \T2 1670 vpxor \XMM6, \T2, \T2 1671 vmovdqu HashKey_3(arg2), \T5 1672 vpclmulqdq $0x11, \T5, \XMM6, \T4 1673 vpxor \T4, \T6, \T6 1674 1675 vpclmulqdq $0x00, \T5, 
\XMM6, \T4 1676 vpxor \T4, \T7, \T7 1677 1678 vmovdqu HashKey_3_k(arg2), \T3 1679 vpclmulqdq $0x00, \T3, \T2, \T2 1680 vpxor \T2, \XMM1, \XMM1 1681 1682 ###################### 1683 1684 vpshufd $0b01001110, \XMM7, \T2 1685 vpxor \XMM7, \T2, \T2 1686 vmovdqu HashKey_2(arg2), \T5 1687 vpclmulqdq $0x11, \T5, \XMM7, \T4 1688 vpxor \T4, \T6, \T6 1689 1690 vpclmulqdq $0x00, \T5, \XMM7, \T4 1691 vpxor \T4, \T7, \T7 1692 1693 vmovdqu HashKey_2_k(arg2), \T3 1694 vpclmulqdq $0x00, \T3, \T2, \T2 1695 vpxor \T2, \XMM1, \XMM1 1696 1697 ###################### 1698 1699 vpshufd $0b01001110, \XMM8, \T2 1700 vpxor \XMM8, \T2, \T2 1701 vmovdqu HashKey(arg2), \T5 1702 vpclmulqdq $0x11, \T5, \XMM8, \T4 1703 vpxor \T4, \T6, \T6 1704 1705 vpclmulqdq $0x00, \T5, \XMM8, \T4 1706 vpxor \T4, \T7, \T7 1707 1708 vmovdqu HashKey_k(arg2), \T3 1709 vpclmulqdq $0x00, \T3, \T2, \T2 1710 1711 vpxor \T2, \XMM1, \XMM1 1712 vpxor \T6, \XMM1, \XMM1 1713 vpxor \T7, \XMM1, \T2 1714 1715 1716 1717 1718 vpslldq $8, \T2, \T4 1719 vpsrldq $8, \T2, \T2 1720 1721 vpxor \T4, \T7, \T7 1722 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of 1723 # the accumulated carry-less multiplications 1724 1725 ####################################################################### 1726 #first phase of the reduction 1727 vpslld $31, \T7, \T2 # packed right shifting << 31 1728 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1729 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1730 1731 vpxor \T3, \T2, \T2 # xor the shifted versions 1732 vpxor \T4, \T2, \T2 1733 1734 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1735 1736 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1737 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1738 ####################################################################### 1739 1740 1741 #second phase of the reduction 1742 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1743 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1744 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1745 vpxor \T3, \T2, \T2 # xor the shifted versions 1746 vpxor \T4, \T2, \T2 1747 1748 vpxor \T1, \T2, \T2 1749 vpxor \T2, \T7, \T7 1750 vpxor \T7, \T6, \T6 # the result is in T6 1751 1752.endm 1753 1754############################################################# 1755#void aesni_gcm_precomp_avx_gen2 1756# (gcm_data *my_ctx_data, 1757# gcm_context_data *data, 1758# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 1759# u8 *iv, /* Pre-counter block j0: 4 byte salt 1760# (from Security Association) concatenated with 8 byte 1761# Initialisation Vector (from IPSec ESP Payload) 1762# concatenated with 0x00000001. 16-byte aligned pointer. */ 1763# const u8 *aad, /* Additional Authentication Data (AAD)*/ 1764# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1765############################################################# 1766SYM_FUNC_START(aesni_gcm_init_avx_gen2) 1767 FUNC_SAVE 1768 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX 1769 FUNC_RESTORE 1770 RET 1771SYM_FUNC_END(aesni_gcm_init_avx_gen2) 1772 1773############################################################################### 1774#void aesni_gcm_enc_update_avx_gen2( 1775# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1776# gcm_context_data *data, 1777# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 1778# const u8 *in, /* Plaintext input */ 1779# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ 1780############################################################################### 1781SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) 1782 FUNC_SAVE 1783 mov keysize, %eax 1784 cmp $32, %eax 1785 je key_256_enc_update 1786 cmp $16, %eax 1787 je key_128_enc_update 1788 # must be 192 1789 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 1790 FUNC_RESTORE 1791 RET 1792key_128_enc_update: 1793 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 1794 FUNC_RESTORE 1795 RET 1796key_256_enc_update: 1797 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 1798 FUNC_RESTORE 1799 RET 1800SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) 1801 1802############################################################################### 1803#void aesni_gcm_dec_update_avx_gen2( 1804# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1805# gcm_context_data *data, 1806# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 1807# const u8 *in, /* Ciphertext input */ 1808# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 1809############################################################################### 1810SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) 1811 FUNC_SAVE 1812 mov keysize,%eax 1813 cmp $32, %eax 1814 je key_256_dec_update 1815 cmp $16, %eax 1816 je key_128_dec_update 1817 # must be 192 1818 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 1819 FUNC_RESTORE 1820 RET 1821key_128_dec_update: 1822 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 1823 FUNC_RESTORE 1824 RET 1825key_256_dec_update: 1826 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 1827 FUNC_RESTORE 1828 RET 1829SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) 1830 1831############################################################################### 1832#void aesni_gcm_finalize_avx_gen2( 1833# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1834# gcm_context_data *data, 1835# u8 *auth_tag, /* Authenticated Tag output. */ 1836# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 1837# Valid values are 16 (most likely), 12 or 8. */ 1838############################################################################### 1839SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) 1840 FUNC_SAVE 1841 mov keysize,%eax 1842 cmp $32, %eax 1843 je key_256_finalize 1844 cmp $16, %eax 1845 je key_128_finalize 1846 # must be 192 1847 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 1848 FUNC_RESTORE 1849 RET 1850key_128_finalize: 1851 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 1852 FUNC_RESTORE 1853 RET 1854key_256_finalize: 1855 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 1856 FUNC_RESTORE 1857 RET 1858SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) 1859 1860############################################################################### 1861# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 1862# Input: A and B (128-bits each, bit-reflected) 1863# Output: C = A*B*x mod poly, (i.e. >>1 ) 1864# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 1865# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
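# Informal reference note on the reduction used below: modulo the GCM
# polynomial
#     g(x) = x^128 + x^127 + x^126 + x^121 + 1
# the upper 128-bit half H of the 256-bit carry-less product satisfies
#     H * x^128 = H * (x^127 + x^126 + x^121 + 1)   (mod g(x)),
# so the two POLY2-based vpclmulqdq steps in this macro fold the high half
# back into the low half, replacing the shift/xor reduction used by the
# GHASH_MUL_AVX variant above.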
1866############################################################################### 1867.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 1868 1869 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 1870 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 1871 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 1872 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 1873 vpxor \T3, \GH, \GH 1874 1875 1876 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs 1877 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs 1878 1879 vpxor \T3, \T1, \T1 1880 vpxor \T2, \GH, \GH 1881 1882 ####################################################################### 1883 #first phase of the reduction 1884 vmovdqa POLY2(%rip), \T3 1885 1886 vpclmulqdq $0x01, \GH, \T3, \T2 1887 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs 1888 1889 vpxor \T2, \GH, \GH # first phase of the reduction complete 1890 ####################################################################### 1891 #second phase of the reduction 1892 vpclmulqdq $0x00, \GH, \T3, \T2 1893 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1894 1895 vpclmulqdq $0x10, \GH, \T3, \GH 1896 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) 1897 1898 vpxor \T2, \GH, \GH # second phase of the reduction complete 1899 ####################################################################### 1900 vpxor \T1, \GH, \GH # the result is in GH 1901 1902 1903.endm 1904 1905.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 1906 1907 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1908 vmovdqa \HK, \T5 1909 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 1910 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly 1911 1912 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 1913 vmovdqu \T5, HashKey_3(arg2) 1914 1915 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 1916 vmovdqu \T5, HashKey_4(arg2) 1917 1918 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 1919 vmovdqu \T5, HashKey_5(arg2) 1920 1921 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 1922 vmovdqu \T5, HashKey_6(arg2) 1923 1924 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 1925 vmovdqu \T5, HashKey_7(arg2) 1926 1927 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 1928 vmovdqu \T5, HashKey_8(arg2) 1929 1930.endm 1931 1932## if a = number of total plaintext bytes 1933## b = floor(a/16) 1934## num_initial_blocks = b mod 4# 1935## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 1936## r10, r11, r12, rax are clobbered 1937## arg1, arg2, arg3, arg4 are used as pointers only, not modified 1938 1939.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER 1940 i = (8-\num_initial_blocks) 1941 setreg 1942 vmovdqu AadHash(arg2), reg_i 1943 1944 # start AES for num_initial_blocks blocks 1945 vmovdqu CurCount(arg2), \CTR 1946 1947 i = (9-\num_initial_blocks) 1948 setreg 1949.rep \num_initial_blocks 1950 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1951 vmovdqa \CTR, reg_i 1952 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1953 i = (i+1) 1954 setreg 1955.endr 1956 1957 vmovdqa (arg1), \T_key 1958 i = (9-\num_initial_blocks) 1959 setreg 1960.rep \num_initial_blocks 1961 vpxor \T_key, reg_i, reg_i 1962 i = (i+1) 1963 setreg 1964.endr 1965 1966 j = 1 1967 setreg 1968.rep \REP 1969 vmovdqa 
16*j(arg1), \T_key 1970 i = (9-\num_initial_blocks) 1971 setreg 1972.rep \num_initial_blocks 1973 vaesenc \T_key, reg_i, reg_i 1974 i = (i+1) 1975 setreg 1976.endr 1977 1978 j = (j+1) 1979 setreg 1980.endr 1981 1982 1983 vmovdqa 16*j(arg1), \T_key 1984 i = (9-\num_initial_blocks) 1985 setreg 1986.rep \num_initial_blocks 1987 vaesenclast \T_key, reg_i, reg_i 1988 i = (i+1) 1989 setreg 1990.endr 1991 1992 i = (9-\num_initial_blocks) 1993 setreg 1994.rep \num_initial_blocks 1995 vmovdqu (arg4, %r11), \T1 1996 vpxor \T1, reg_i, reg_i 1997 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for 1998 # num_initial_blocks blocks 1999 add $16, %r11 2000.if \ENC_DEC == DEC 2001 vmovdqa \T1, reg_i 2002.endif 2003 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 2004 i = (i+1) 2005 setreg 2006.endr 2007 2008 2009 i = (8-\num_initial_blocks) 2010 j = (9-\num_initial_blocks) 2011 setreg 2012 2013.rep \num_initial_blocks 2014 vpxor reg_i, reg_j, reg_j 2015 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 2016 i = (i+1) 2017 j = (j+1) 2018 setreg 2019.endr 2020 # XMM8 has the combined result here 2021 2022 vmovdqa \XMM8, TMP1(%rsp) 2023 vmovdqa \XMM8, \T3 2024 2025 cmp $128, %r13 2026 jl _initial_blocks_done\@ # no need for precomputed constants 2027 2028############################################################################### 2029# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 2030 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2031 vmovdqa \CTR, \XMM1 2032 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2033 2034 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2035 vmovdqa \CTR, \XMM2 2036 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2037 2038 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2039 vmovdqa \CTR, \XMM3 2040 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2041 2042 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2043 vmovdqa \CTR, \XMM4 2044 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2045 2046 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2047 vmovdqa \CTR, \XMM5 2048 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2049 2050 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2051 vmovdqa \CTR, \XMM6 2052 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2053 2054 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2055 vmovdqa \CTR, \XMM7 2056 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2057 2058 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2059 vmovdqa \CTR, \XMM8 2060 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2061 2062 vmovdqa (arg1), \T_key 2063 vpxor \T_key, \XMM1, \XMM1 2064 vpxor \T_key, \XMM2, \XMM2 2065 vpxor \T_key, \XMM3, \XMM3 2066 vpxor \T_key, \XMM4, \XMM4 2067 vpxor \T_key, \XMM5, \XMM5 2068 vpxor \T_key, \XMM6, \XMM6 2069 vpxor \T_key, \XMM7, \XMM7 2070 vpxor \T_key, \XMM8, \XMM8 2071 2072 i = 1 2073 setreg 2074.rep \REP # do REP rounds 2075 vmovdqa 16*i(arg1), \T_key 2076 vaesenc \T_key, \XMM1, \XMM1 2077 vaesenc \T_key, \XMM2, \XMM2 2078 vaesenc \T_key, \XMM3, \XMM3 2079 vaesenc \T_key, \XMM4, \XMM4 2080 vaesenc \T_key, \XMM5, \XMM5 2081 vaesenc \T_key, \XMM6, \XMM6 2082 vaesenc \T_key, \XMM7, \XMM7 2083 vaesenc \T_key, \XMM8, \XMM8 2084 i = (i+1) 2085 setreg 2086.endr 2087 2088 2089 vmovdqa 16*i(arg1), \T_key 2090 vaesenclast \T_key, \XMM1, \XMM1 2091 vaesenclast \T_key, \XMM2, \XMM2 2092 vaesenclast \T_key, \XMM3, \XMM3 2093 vaesenclast \T_key, \XMM4, \XMM4 2094 vaesenclast \T_key, \XMM5, \XMM5 2095 
vaesenclast \T_key, \XMM6, \XMM6 2096 vaesenclast \T_key, \XMM7, \XMM7 2097 vaesenclast \T_key, \XMM8, \XMM8 2098 2099 vmovdqu (arg4, %r11), \T1 2100 vpxor \T1, \XMM1, \XMM1 2101 vmovdqu \XMM1, (arg3 , %r11) 2102 .if \ENC_DEC == DEC 2103 vmovdqa \T1, \XMM1 2104 .endif 2105 2106 vmovdqu 16*1(arg4, %r11), \T1 2107 vpxor \T1, \XMM2, \XMM2 2108 vmovdqu \XMM2, 16*1(arg3 , %r11) 2109 .if \ENC_DEC == DEC 2110 vmovdqa \T1, \XMM2 2111 .endif 2112 2113 vmovdqu 16*2(arg4, %r11), \T1 2114 vpxor \T1, \XMM3, \XMM3 2115 vmovdqu \XMM3, 16*2(arg3 , %r11) 2116 .if \ENC_DEC == DEC 2117 vmovdqa \T1, \XMM3 2118 .endif 2119 2120 vmovdqu 16*3(arg4, %r11), \T1 2121 vpxor \T1, \XMM4, \XMM4 2122 vmovdqu \XMM4, 16*3(arg3 , %r11) 2123 .if \ENC_DEC == DEC 2124 vmovdqa \T1, \XMM4 2125 .endif 2126 2127 vmovdqu 16*4(arg4, %r11), \T1 2128 vpxor \T1, \XMM5, \XMM5 2129 vmovdqu \XMM5, 16*4(arg3 , %r11) 2130 .if \ENC_DEC == DEC 2131 vmovdqa \T1, \XMM5 2132 .endif 2133 2134 vmovdqu 16*5(arg4, %r11), \T1 2135 vpxor \T1, \XMM6, \XMM6 2136 vmovdqu \XMM6, 16*5(arg3 , %r11) 2137 .if \ENC_DEC == DEC 2138 vmovdqa \T1, \XMM6 2139 .endif 2140 2141 vmovdqu 16*6(arg4, %r11), \T1 2142 vpxor \T1, \XMM7, \XMM7 2143 vmovdqu \XMM7, 16*6(arg3 , %r11) 2144 .if \ENC_DEC == DEC 2145 vmovdqa \T1, \XMM7 2146 .endif 2147 2148 vmovdqu 16*7(arg4, %r11), \T1 2149 vpxor \T1, \XMM8, \XMM8 2150 vmovdqu \XMM8, 16*7(arg3 , %r11) 2151 .if \ENC_DEC == DEC 2152 vmovdqa \T1, \XMM8 2153 .endif 2154 2155 add $128, %r11 2156 2157 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2158 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with 2159 # the corresponding ciphertext 2160 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2161 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2162 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2163 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2164 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2165 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2166 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2167 2168############################################################################### 2169 2170_initial_blocks_done\@: 2171 2172 2173.endm 2174 2175 2176 2177# encrypt 8 blocks at a time 2178# ghash the 8 previously encrypted ciphertext blocks 2179# arg1, arg2, arg3, arg4 are used as pointers only, not modified 2180# r11 is the data offset value 2181.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 2182 2183 vmovdqa \XMM1, \T2 2184 vmovdqa \XMM2, TMP2(%rsp) 2185 vmovdqa \XMM3, TMP3(%rsp) 2186 vmovdqa \XMM4, TMP4(%rsp) 2187 vmovdqa \XMM5, TMP5(%rsp) 2188 vmovdqa \XMM6, TMP6(%rsp) 2189 vmovdqa \XMM7, TMP7(%rsp) 2190 vmovdqa \XMM8, TMP8(%rsp) 2191 2192.if \loop_idx == in_order 2193 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 2194 vpaddd ONE(%rip), \XMM1, \XMM2 2195 vpaddd ONE(%rip), \XMM2, \XMM3 2196 vpaddd ONE(%rip), \XMM3, \XMM4 2197 vpaddd ONE(%rip), \XMM4, \XMM5 2198 vpaddd ONE(%rip), \XMM5, \XMM6 2199 vpaddd ONE(%rip), \XMM6, \XMM7 2200 vpaddd ONE(%rip), \XMM7, \XMM8 2201 vmovdqa \XMM8, \CTR 2202 2203 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2204 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2205 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2206 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2207 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2208 vpshufb 
SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2209 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2210 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2211.else 2212 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 2213 vpaddd ONEf(%rip), \XMM1, \XMM2 2214 vpaddd ONEf(%rip), \XMM2, \XMM3 2215 vpaddd ONEf(%rip), \XMM3, \XMM4 2216 vpaddd ONEf(%rip), \XMM4, \XMM5 2217 vpaddd ONEf(%rip), \XMM5, \XMM6 2218 vpaddd ONEf(%rip), \XMM6, \XMM7 2219 vpaddd ONEf(%rip), \XMM7, \XMM8 2220 vmovdqa \XMM8, \CTR 2221.endif 2222 2223 2224 ####################################################################### 2225 2226 vmovdqu (arg1), \T1 2227 vpxor \T1, \XMM1, \XMM1 2228 vpxor \T1, \XMM2, \XMM2 2229 vpxor \T1, \XMM3, \XMM3 2230 vpxor \T1, \XMM4, \XMM4 2231 vpxor \T1, \XMM5, \XMM5 2232 vpxor \T1, \XMM6, \XMM6 2233 vpxor \T1, \XMM7, \XMM7 2234 vpxor \T1, \XMM8, \XMM8 2235 2236 ####################################################################### 2237 2238 2239 2240 2241 2242 vmovdqu 16*1(arg1), \T1 2243 vaesenc \T1, \XMM1, \XMM1 2244 vaesenc \T1, \XMM2, \XMM2 2245 vaesenc \T1, \XMM3, \XMM3 2246 vaesenc \T1, \XMM4, \XMM4 2247 vaesenc \T1, \XMM5, \XMM5 2248 vaesenc \T1, \XMM6, \XMM6 2249 vaesenc \T1, \XMM7, \XMM7 2250 vaesenc \T1, \XMM8, \XMM8 2251 2252 vmovdqu 16*2(arg1), \T1 2253 vaesenc \T1, \XMM1, \XMM1 2254 vaesenc \T1, \XMM2, \XMM2 2255 vaesenc \T1, \XMM3, \XMM3 2256 vaesenc \T1, \XMM4, \XMM4 2257 vaesenc \T1, \XMM5, \XMM5 2258 vaesenc \T1, \XMM6, \XMM6 2259 vaesenc \T1, \XMM7, \XMM7 2260 vaesenc \T1, \XMM8, \XMM8 2261 2262 2263 ####################################################################### 2264 2265 vmovdqu HashKey_8(arg2), \T5 2266 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 2267 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 2268 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 2269 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 2270 vpxor \T5, \T6, \T6 2271 2272 vmovdqu 16*3(arg1), \T1 2273 vaesenc \T1, \XMM1, \XMM1 2274 vaesenc \T1, \XMM2, \XMM2 2275 vaesenc \T1, \XMM3, \XMM3 2276 vaesenc \T1, \XMM4, \XMM4 2277 vaesenc \T1, \XMM5, \XMM5 2278 vaesenc \T1, \XMM6, \XMM6 2279 vaesenc \T1, \XMM7, \XMM7 2280 vaesenc \T1, \XMM8, \XMM8 2281 2282 vmovdqa TMP2(%rsp), \T1 2283 vmovdqu HashKey_7(arg2), \T5 2284 vpclmulqdq $0x11, \T5, \T1, \T3 2285 vpxor \T3, \T4, \T4 2286 2287 vpclmulqdq $0x00, \T5, \T1, \T3 2288 vpxor \T3, \T7, \T7 2289 2290 vpclmulqdq $0x01, \T5, \T1, \T3 2291 vpxor \T3, \T6, \T6 2292 2293 vpclmulqdq $0x10, \T5, \T1, \T3 2294 vpxor \T3, \T6, \T6 2295 2296 vmovdqu 16*4(arg1), \T1 2297 vaesenc \T1, \XMM1, \XMM1 2298 vaesenc \T1, \XMM2, \XMM2 2299 vaesenc \T1, \XMM3, \XMM3 2300 vaesenc \T1, \XMM4, \XMM4 2301 vaesenc \T1, \XMM5, \XMM5 2302 vaesenc \T1, \XMM6, \XMM6 2303 vaesenc \T1, \XMM7, \XMM7 2304 vaesenc \T1, \XMM8, \XMM8 2305 2306 ####################################################################### 2307 2308 vmovdqa TMP3(%rsp), \T1 2309 vmovdqu HashKey_6(arg2), \T5 2310 vpclmulqdq $0x11, \T5, \T1, \T3 2311 vpxor \T3, \T4, \T4 2312 2313 vpclmulqdq $0x00, \T5, \T1, \T3 2314 vpxor \T3, \T7, \T7 2315 2316 vpclmulqdq $0x01, \T5, \T1, \T3 2317 vpxor \T3, \T6, \T6 2318 2319 vpclmulqdq $0x10, \T5, \T1, \T3 2320 vpxor \T3, \T6, \T6 2321 2322 vmovdqu 16*5(arg1), \T1 2323 vaesenc \T1, \XMM1, \XMM1 2324 vaesenc \T1, \XMM2, \XMM2 2325 vaesenc \T1, \XMM3, \XMM3 2326 vaesenc \T1, \XMM4, \XMM4 2327 vaesenc \T1, \XMM5, \XMM5 2328 vaesenc \T1, \XMM6, \XMM6 2329 vaesenc \T1, \XMM7, \XMM7 2330 vaesenc \T1, \XMM8, \XMM8 2331 2332 vmovdqa TMP4(%rsp), \T1 2333 vmovdqu 
HashKey_5(arg2), \T5 2334 vpclmulqdq $0x11, \T5, \T1, \T3 2335 vpxor \T3, \T4, \T4 2336 2337 vpclmulqdq $0x00, \T5, \T1, \T3 2338 vpxor \T3, \T7, \T7 2339 2340 vpclmulqdq $0x01, \T5, \T1, \T3 2341 vpxor \T3, \T6, \T6 2342 2343 vpclmulqdq $0x10, \T5, \T1, \T3 2344 vpxor \T3, \T6, \T6 2345 2346 vmovdqu 16*6(arg1), \T1 2347 vaesenc \T1, \XMM1, \XMM1 2348 vaesenc \T1, \XMM2, \XMM2 2349 vaesenc \T1, \XMM3, \XMM3 2350 vaesenc \T1, \XMM4, \XMM4 2351 vaesenc \T1, \XMM5, \XMM5 2352 vaesenc \T1, \XMM6, \XMM6 2353 vaesenc \T1, \XMM7, \XMM7 2354 vaesenc \T1, \XMM8, \XMM8 2355 2356 2357 vmovdqa TMP5(%rsp), \T1 2358 vmovdqu HashKey_4(arg2), \T5 2359 vpclmulqdq $0x11, \T5, \T1, \T3 2360 vpxor \T3, \T4, \T4 2361 2362 vpclmulqdq $0x00, \T5, \T1, \T3 2363 vpxor \T3, \T7, \T7 2364 2365 vpclmulqdq $0x01, \T5, \T1, \T3 2366 vpxor \T3, \T6, \T6 2367 2368 vpclmulqdq $0x10, \T5, \T1, \T3 2369 vpxor \T3, \T6, \T6 2370 2371 vmovdqu 16*7(arg1), \T1 2372 vaesenc \T1, \XMM1, \XMM1 2373 vaesenc \T1, \XMM2, \XMM2 2374 vaesenc \T1, \XMM3, \XMM3 2375 vaesenc \T1, \XMM4, \XMM4 2376 vaesenc \T1, \XMM5, \XMM5 2377 vaesenc \T1, \XMM6, \XMM6 2378 vaesenc \T1, \XMM7, \XMM7 2379 vaesenc \T1, \XMM8, \XMM8 2380 2381 vmovdqa TMP6(%rsp), \T1 2382 vmovdqu HashKey_3(arg2), \T5 2383 vpclmulqdq $0x11, \T5, \T1, \T3 2384 vpxor \T3, \T4, \T4 2385 2386 vpclmulqdq $0x00, \T5, \T1, \T3 2387 vpxor \T3, \T7, \T7 2388 2389 vpclmulqdq $0x01, \T5, \T1, \T3 2390 vpxor \T3, \T6, \T6 2391 2392 vpclmulqdq $0x10, \T5, \T1, \T3 2393 vpxor \T3, \T6, \T6 2394 2395 vmovdqu 16*8(arg1), \T1 2396 vaesenc \T1, \XMM1, \XMM1 2397 vaesenc \T1, \XMM2, \XMM2 2398 vaesenc \T1, \XMM3, \XMM3 2399 vaesenc \T1, \XMM4, \XMM4 2400 vaesenc \T1, \XMM5, \XMM5 2401 vaesenc \T1, \XMM6, \XMM6 2402 vaesenc \T1, \XMM7, \XMM7 2403 vaesenc \T1, \XMM8, \XMM8 2404 2405 vmovdqa TMP7(%rsp), \T1 2406 vmovdqu HashKey_2(arg2), \T5 2407 vpclmulqdq $0x11, \T5, \T1, \T3 2408 vpxor \T3, \T4, \T4 2409 2410 vpclmulqdq $0x00, \T5, \T1, \T3 2411 vpxor \T3, \T7, \T7 2412 2413 vpclmulqdq $0x01, \T5, \T1, \T3 2414 vpxor \T3, \T6, \T6 2415 2416 vpclmulqdq $0x10, \T5, \T1, \T3 2417 vpxor \T3, \T6, \T6 2418 2419 2420 ####################################################################### 2421 2422 vmovdqu 16*9(arg1), \T5 2423 vaesenc \T5, \XMM1, \XMM1 2424 vaesenc \T5, \XMM2, \XMM2 2425 vaesenc \T5, \XMM3, \XMM3 2426 vaesenc \T5, \XMM4, \XMM4 2427 vaesenc \T5, \XMM5, \XMM5 2428 vaesenc \T5, \XMM6, \XMM6 2429 vaesenc \T5, \XMM7, \XMM7 2430 vaesenc \T5, \XMM8, \XMM8 2431 2432 vmovdqa TMP8(%rsp), \T1 2433 vmovdqu HashKey(arg2), \T5 2434 2435 vpclmulqdq $0x00, \T5, \T1, \T3 2436 vpxor \T3, \T7, \T7 2437 2438 vpclmulqdq $0x01, \T5, \T1, \T3 2439 vpxor \T3, \T6, \T6 2440 2441 vpclmulqdq $0x10, \T5, \T1, \T3 2442 vpxor \T3, \T6, \T6 2443 2444 vpclmulqdq $0x11, \T5, \T1, \T3 2445 vpxor \T3, \T4, \T1 2446 2447 2448 vmovdqu 16*10(arg1), \T5 2449 2450 i = 11 2451 setreg 2452.rep (\REP-9) 2453 vaesenc \T5, \XMM1, \XMM1 2454 vaesenc \T5, \XMM2, \XMM2 2455 vaesenc \T5, \XMM3, \XMM3 2456 vaesenc \T5, \XMM4, \XMM4 2457 vaesenc \T5, \XMM5, \XMM5 2458 vaesenc \T5, \XMM6, \XMM6 2459 vaesenc \T5, \XMM7, \XMM7 2460 vaesenc \T5, \XMM8, \XMM8 2461 2462 vmovdqu 16*i(arg1), \T5 2463 i = i + 1 2464 setreg 2465.endr 2466 2467 i = 0 2468 j = 1 2469 setreg 2470.rep 8 2471 vpxor 16*i(arg4, %r11), \T5, \T2 2472 .if \ENC_DEC == ENC 2473 vaesenclast \T2, reg_j, reg_j 2474 .else 2475 vaesenclast \T2, reg_j, \T3 2476 vmovdqu 16*i(arg4, %r11), reg_j 2477 vmovdqu \T3, 16*i(arg3, %r11) 2478 .endif 2479 i = (i+1) 2480 j = (j+1) 2481 setreg 
.endr
	#######################################################################


	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
	vpxor	\T3, \T7, \T7
	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7



	#######################################################################
	#first phase of the reduction
	vmovdqa	POLY2(%rip), \T3

	vpclmulqdq	$0x01, \T7, \T3, \T2
	vpslldq	$8, \T2, \T2				# shift-L T2 2 DWs

	vpxor	\T2, \T7, \T7				# first phase of the reduction complete
	#######################################################################
	.if \ENC_DEC == ENC
	vmovdqu	\XMM1, 16*0(arg3,%r11)			# Write to the Ciphertext buffer
	vmovdqu	\XMM2, 16*1(arg3,%r11)			# Write to the Ciphertext buffer
	vmovdqu	\XMM3, 16*2(arg3,%r11)			# Write to the Ciphertext buffer
	vmovdqu	\XMM4, 16*3(arg3,%r11)			# Write to the Ciphertext buffer
	vmovdqu	\XMM5, 16*4(arg3,%r11)			# Write to the Ciphertext buffer
	vmovdqu	\XMM6, 16*5(arg3,%r11)			# Write to the Ciphertext buffer
	vmovdqu	\XMM7, 16*6(arg3,%r11)			# Write to the Ciphertext buffer
	vmovdqu	\XMM8, 16*7(arg3,%r11)			# Write to the Ciphertext buffer
	.endif

	#######################################################################
	#second phase of the reduction
	vpclmulqdq	$0x00, \T7, \T3, \T2
	vpsrldq	$4, \T2, \T2				# shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

	vpclmulqdq	$0x10, \T7, \T3, \T4
	vpslldq	$4, \T4, \T4				# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

	vpxor	\T2, \T4, \T4				# second phase of the reduction complete
	#######################################################################
	vpxor	\T4, \T1, \T1				# the result is in T1

	vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1		# perform a 16Byte swap
	vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2		# perform a 16Byte swap
	vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3		# perform a 16Byte swap
	vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4		# perform a 16Byte swap
	vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5		# perform a 16Byte swap
	vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6		# perform a 16Byte swap
	vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7		# perform a 16Byte swap
	vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8		# perform a 16Byte swap


	vpxor	\T1, \XMM1, \XMM1



.endm


# GHASH the last 8 ciphertext blocks.
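# Each of the 8 blocks is multiplied by its matching HashKey power using one
# Karatsuba multiplication per block.  Sketch of the identity relied on
# (illustrative only; '+' denotes XOR in GF(2)):
#
#   a*b = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
#
# Per block the macro below therefore accumulates three partial products:
#   T6   ^= clmul(a1, b1)            high halves
#   T7   ^= clmul(a0, b0)            low halves
#   XMM1 ^= clmul(a1+a0, b1+b0)      Karatsuba middle terms
# and only at the end recombines the middle terms and reduces the 256-bit
# sum modulo the GHASH polynomial via POLY2.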
2543.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 2544 2545 ## Karatsuba Method 2546 2547 vmovdqu HashKey_8(arg2), \T5 2548 2549 vpshufd $0b01001110, \XMM1, \T2 2550 vpshufd $0b01001110, \T5, \T3 2551 vpxor \XMM1, \T2, \T2 2552 vpxor \T5, \T3, \T3 2553 2554 vpclmulqdq $0x11, \T5, \XMM1, \T6 2555 vpclmulqdq $0x00, \T5, \XMM1, \T7 2556 2557 vpclmulqdq $0x00, \T3, \T2, \XMM1 2558 2559 ###################### 2560 2561 vmovdqu HashKey_7(arg2), \T5 2562 vpshufd $0b01001110, \XMM2, \T2 2563 vpshufd $0b01001110, \T5, \T3 2564 vpxor \XMM2, \T2, \T2 2565 vpxor \T5, \T3, \T3 2566 2567 vpclmulqdq $0x11, \T5, \XMM2, \T4 2568 vpxor \T4, \T6, \T6 2569 2570 vpclmulqdq $0x00, \T5, \XMM2, \T4 2571 vpxor \T4, \T7, \T7 2572 2573 vpclmulqdq $0x00, \T3, \T2, \T2 2574 2575 vpxor \T2, \XMM1, \XMM1 2576 2577 ###################### 2578 2579 vmovdqu HashKey_6(arg2), \T5 2580 vpshufd $0b01001110, \XMM3, \T2 2581 vpshufd $0b01001110, \T5, \T3 2582 vpxor \XMM3, \T2, \T2 2583 vpxor \T5, \T3, \T3 2584 2585 vpclmulqdq $0x11, \T5, \XMM3, \T4 2586 vpxor \T4, \T6, \T6 2587 2588 vpclmulqdq $0x00, \T5, \XMM3, \T4 2589 vpxor \T4, \T7, \T7 2590 2591 vpclmulqdq $0x00, \T3, \T2, \T2 2592 2593 vpxor \T2, \XMM1, \XMM1 2594 2595 ###################### 2596 2597 vmovdqu HashKey_5(arg2), \T5 2598 vpshufd $0b01001110, \XMM4, \T2 2599 vpshufd $0b01001110, \T5, \T3 2600 vpxor \XMM4, \T2, \T2 2601 vpxor \T5, \T3, \T3 2602 2603 vpclmulqdq $0x11, \T5, \XMM4, \T4 2604 vpxor \T4, \T6, \T6 2605 2606 vpclmulqdq $0x00, \T5, \XMM4, \T4 2607 vpxor \T4, \T7, \T7 2608 2609 vpclmulqdq $0x00, \T3, \T2, \T2 2610 2611 vpxor \T2, \XMM1, \XMM1 2612 2613 ###################### 2614 2615 vmovdqu HashKey_4(arg2), \T5 2616 vpshufd $0b01001110, \XMM5, \T2 2617 vpshufd $0b01001110, \T5, \T3 2618 vpxor \XMM5, \T2, \T2 2619 vpxor \T5, \T3, \T3 2620 2621 vpclmulqdq $0x11, \T5, \XMM5, \T4 2622 vpxor \T4, \T6, \T6 2623 2624 vpclmulqdq $0x00, \T5, \XMM5, \T4 2625 vpxor \T4, \T7, \T7 2626 2627 vpclmulqdq $0x00, \T3, \T2, \T2 2628 2629 vpxor \T2, \XMM1, \XMM1 2630 2631 ###################### 2632 2633 vmovdqu HashKey_3(arg2), \T5 2634 vpshufd $0b01001110, \XMM6, \T2 2635 vpshufd $0b01001110, \T5, \T3 2636 vpxor \XMM6, \T2, \T2 2637 vpxor \T5, \T3, \T3 2638 2639 vpclmulqdq $0x11, \T5, \XMM6, \T4 2640 vpxor \T4, \T6, \T6 2641 2642 vpclmulqdq $0x00, \T5, \XMM6, \T4 2643 vpxor \T4, \T7, \T7 2644 2645 vpclmulqdq $0x00, \T3, \T2, \T2 2646 2647 vpxor \T2, \XMM1, \XMM1 2648 2649 ###################### 2650 2651 vmovdqu HashKey_2(arg2), \T5 2652 vpshufd $0b01001110, \XMM7, \T2 2653 vpshufd $0b01001110, \T5, \T3 2654 vpxor \XMM7, \T2, \T2 2655 vpxor \T5, \T3, \T3 2656 2657 vpclmulqdq $0x11, \T5, \XMM7, \T4 2658 vpxor \T4, \T6, \T6 2659 2660 vpclmulqdq $0x00, \T5, \XMM7, \T4 2661 vpxor \T4, \T7, \T7 2662 2663 vpclmulqdq $0x00, \T3, \T2, \T2 2664 2665 vpxor \T2, \XMM1, \XMM1 2666 2667 ###################### 2668 2669 vmovdqu HashKey(arg2), \T5 2670 vpshufd $0b01001110, \XMM8, \T2 2671 vpshufd $0b01001110, \T5, \T3 2672 vpxor \XMM8, \T2, \T2 2673 vpxor \T5, \T3, \T3 2674 2675 vpclmulqdq $0x11, \T5, \XMM8, \T4 2676 vpxor \T4, \T6, \T6 2677 2678 vpclmulqdq $0x00, \T5, \XMM8, \T4 2679 vpxor \T4, \T7, \T7 2680 2681 vpclmulqdq $0x00, \T3, \T2, \T2 2682 2683 vpxor \T2, \XMM1, \XMM1 2684 vpxor \T6, \XMM1, \XMM1 2685 vpxor \T7, \XMM1, \T2 2686 2687 2688 2689 2690 vpslldq $8, \T2, \T4 2691 vpsrldq $8, \T2, \T2 2692 2693 vpxor \T4, \T7, \T7 2694 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the 2695 # accumulated carry-less multiplications 2696 
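	# <T6:T7> now has to be folded back to 128 bits.  As in GHASH_MUL_AVX2,
	# the fold below runs in two PCLMULQDQ phases against the constant
	# POLY2 (plus byte shifts) rather than the shift/xor reduction used by
	# GHASH_LAST_8_AVX; the reduced GHASH value ends up in T6.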
2697 ####################################################################### 2698 #first phase of the reduction 2699 vmovdqa POLY2(%rip), \T3 2700 2701 vpclmulqdq $0x01, \T7, \T3, \T2 2702 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2703 2704 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2705 ####################################################################### 2706 2707 2708 #second phase of the reduction 2709 vpclmulqdq $0x00, \T7, \T3, \T2 2710 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2711 2712 vpclmulqdq $0x10, \T7, \T3, \T4 2713 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2714 2715 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2716 ####################################################################### 2717 vpxor \T4, \T6, \T6 # the result is in T6 2718.endm 2719 2720 2721 2722############################################################# 2723#void aesni_gcm_init_avx_gen4 2724# (gcm_data *my_ctx_data, 2725# gcm_context_data *data, 2726# u8 *iv, /* Pre-counter block j0: 4 byte salt 2727# (from Security Association) concatenated with 8 byte 2728# Initialisation Vector (from IPSec ESP Payload) 2729# concatenated with 0x00000001. 16-byte aligned pointer. */ 2730# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 2731# const u8 *aad, /* Additional Authentication Data (AAD)*/ 2732# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 2733############################################################# 2734SYM_FUNC_START(aesni_gcm_init_avx_gen4) 2735 FUNC_SAVE 2736 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 2737 FUNC_RESTORE 2738 RET 2739SYM_FUNC_END(aesni_gcm_init_avx_gen4) 2740 2741############################################################################### 2742#void aesni_gcm_enc_avx_gen4( 2743# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2744# gcm_context_data *data, 2745# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 2746# const u8 *in, /* Plaintext input */ 2747# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 2748############################################################################### 2749SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) 2750 FUNC_SAVE 2751 mov keysize,%eax 2752 cmp $32, %eax 2753 je key_256_enc_update4 2754 cmp $16, %eax 2755 je key_128_enc_update4 2756 # must be 192 2757 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 2758 FUNC_RESTORE 2759 RET 2760key_128_enc_update4: 2761 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 2762 FUNC_RESTORE 2763 RET 2764key_256_enc_update4: 2765 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 2766 FUNC_RESTORE 2767 RET 2768SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) 2769 2770############################################################################### 2771#void aesni_gcm_dec_update_avx_gen4( 2772# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2773# gcm_context_data *data, 2774# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 2775# const u8 *in, /* Ciphertext input */ 2776# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ 2777############################################################################### 2778SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) 2779 FUNC_SAVE 2780 mov keysize,%eax 2781 cmp $32, %eax 2782 je key_256_dec_update4 2783 cmp $16, %eax 2784 je key_128_dec_update4 2785 # must be 192 2786 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 2787 FUNC_RESTORE 2788 RET 2789key_128_dec_update4: 2790 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 2791 FUNC_RESTORE 2792 RET 2793key_256_dec_update4: 2794 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 2795 FUNC_RESTORE 2796 RET 2797SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) 2798 2799############################################################################### 2800#void aesni_gcm_finalize_avx_gen4( 2801# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2802# gcm_context_data *data, 2803# u8 *auth_tag, /* Authenticated Tag output. */ 2804# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 2805# Valid values are 16 (most likely), 12 or 8. */ 2806############################################################################### 2807SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) 2808 FUNC_SAVE 2809 mov keysize,%eax 2810 cmp $32, %eax 2811 je key_256_finalize4 2812 cmp $16, %eax 2813 je key_128_finalize4 2814 # must be 192 2815 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 2816 FUNC_RESTORE 2817 RET 2818key_128_finalize4: 2819 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 2820 FUNC_RESTORE 2821 RET 2822key_256_finalize4: 2823 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 2824 FUNC_RESTORE 2825 RET 2826SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
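
###############################################################################
# Intended call order for these entry points, shown as a C-style sketch in a
# comment (illustrative only: argument names are taken from the prototype
# comments above, and the real C-side callers may differ in detail):
#
#   aesni_gcm_init_avx_gen4(ctx, data, iv, hash_subkey, aad, aad_len);
#   aesni_gcm_enc_update_avx_gen4(ctx, data, out, in, plaintext_len);
#       /* the update step may be invoked repeatedly on successive chunks */
#   aesni_gcm_finalize_avx_gen4(ctx, data, auth_tag, auth_tag_len);
#
# Decryption uses the same sequence with aesni_gcm_dec_update_avx_gen4() as
# the update step, and the gen2 entry points are driven the same way.
###############################################################################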