cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

aesni-intel_avx-x86_64.S (100008B)


      1########################################################################
      2# Copyright (c) 2013, Intel Corporation
      3#
      4# This software is available to you under a choice of one of two
      5# licenses.  You may choose to be licensed under the terms of the GNU
      6# General Public License (GPL) Version 2, available from the file
      7# COPYING in the main directory of this source tree, or the
      8# OpenIB.org BSD license below:
      9#
     10# Redistribution and use in source and binary forms, with or without
     11# modification, are permitted provided that the following conditions are
     12# met:
     13#
     14# * Redistributions of source code must retain the above copyright
     15#   notice, this list of conditions and the following disclaimer.
     16#
     17# * Redistributions in binary form must reproduce the above copyright
     18#   notice, this list of conditions and the following disclaimer in the
     19#   documentation and/or other materials provided with the
     20#   distribution.
     21#
     22# * Neither the name of the Intel Corporation nor the names of its
     23#   contributors may be used to endorse or promote products derived from
     24#   this software without specific prior written permission.
     25#
     26#
      27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
     28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
     31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
      34# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     38########################################################################
     39##
     40## Authors:
     41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
     42##	Vinodh Gopal <vinodh.gopal@intel.com>
     43##	James Guilford <james.guilford@intel.com>
     44##	Tim Chen <tim.c.chen@linux.intel.com>
     45##
     46## References:
      47##       This code was derived and highly optimized from the code described in the paper:
     48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
     49##			on Intel Architecture Processors. August, 2010
      50##       The details of the implementation are explained in:
     51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
     52##			on Intel Architecture Processors. October, 2012.
     53##
     54## Assumptions:
     55##
     56##
     57##
     58## iv:
     59##       0                   1                   2                   3
     60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
     61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     62##       |                             Salt  (From the SA)               |
     63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     64##       |                     Initialization Vector                     |
     65##       |         (This is the sequence number from IPSec header)       |
     66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     67##       |                              0x1                              |
     68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     69##
     70##
     71##
     72## AAD:
     73##       AAD padded to 128 bits with 0
     74##       for example, assume AAD is a u32 vector
     75##
     76##       if AAD is 8 bytes:
      77##       AAD[3] = {A0, A1};
     78##       padded AAD in xmm register = {A1 A0 0 0}
     79##
     80##       0                   1                   2                   3
     81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
     82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     83##       |                               SPI (A1)                        |
     84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     85##       |                     32-bit Sequence Number (A0)               |
     86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     87##       |                              0x0                              |
     88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     89##
     90##                                       AAD Format with 32-bit Sequence Number
     91##
     92##       if AAD is 12 bytes:
      93##       AAD[3] = {A0, A1, A2};
     94##       padded AAD in xmm register = {A2 A1 A0 0}
     95##
     96##       0                   1                   2                   3
     97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
     98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
     99##       |                               SPI (A2)                        |
    100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    101##       |                 64-bit Extended Sequence Number {A1,A0}       |
    102##       |                                                               |
    103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    104##       |                              0x0                              |
    105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    106##
    107##        AAD Format with 64-bit Extended Sequence Number
    108##
    109##
    110## aadLen:
    111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
     112##	 The code additionally supports an aadLen of 16 bytes.
    113##
    114## TLen:
    115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
    116##
    117## poly = x^128 + x^127 + x^126 + x^121 + 1
    118## throughout the code, one tab and two tab indentations are used. one tab is
     119## for the GHASH part, two tabs are for the AES part.
    120##
    121
    122#include <linux/linkage.h>
    123
    124# constants in mergeable sections, linker can reorder and merge
    125.section	.rodata.cst16.POLY, "aM", @progbits, 16
    126.align 16
    127POLY:            .octa     0xC2000000000000000000000000000001
    128
    129.section	.rodata.cst16.POLY2, "aM", @progbits, 16
    130.align 16
    131POLY2:           .octa     0xC20000000000000000000001C2000000
    132
    133.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
    134.align 16
    135TWOONE:          .octa     0x00000001000000000000000000000001
    136
    137.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
    138.align 16
    139SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
    140
    141.section	.rodata.cst16.ONE, "aM", @progbits, 16
    142.align 16
    143ONE:             .octa     0x00000000000000000000000000000001
    144
    145.section	.rodata.cst16.ONEf, "aM", @progbits, 16
    146.align 16
    147ONEf:            .octa     0x01000000000000000000000000000000
    148
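# A few informal notes on the constants above:
#  - POLY holds the low 128 bits of the GHASH reduction polynomial
#    x^128 + x^127 + x^126 + x^121 + 1 (bits 127, 126, 121 and 0 set;
#    the x^128 term is implicit).
#  - SHUF_MASK, used with vpshufb, reverses the byte order of a register
#    (the "16Byte swap" referred to in the comments below).
#  - ONE increments the counter while it is in byte-reversed (shuffled)
#    form; ONEf increments it while it is still in AES input byte order,
#    which in effect adds one to the last byte of the block; the
#    "cmp $(255-8)" check in GCM_ENC_DEC keeps that byte from overflowing.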
    149# order of these constants should not change.
    150# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
    151.section	.rodata, "a", @progbits
    152.align 16
    153SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
    154ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
    155                 .octa     0x00000000000000000000000000000000
    156
    157.section .rodata
    158.align 16
    159.type aad_shift_arr, @object
    160.size aad_shift_arr, 272
    161aad_shift_arr:
    162        .octa     0xffffffffffffffffffffffffffffffff
    163        .octa     0xffffffffffffffffffffffffffffff0C
    164        .octa     0xffffffffffffffffffffffffffff0D0C
    165        .octa     0xffffffffffffffffffffffffff0E0D0C
    166        .octa     0xffffffffffffffffffffffff0F0E0D0C
    167        .octa     0xffffffffffffffffffffff0C0B0A0908
    168        .octa     0xffffffffffffffffffff0D0C0B0A0908
    169        .octa     0xffffffffffffffffff0E0D0C0B0A0908
    170        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
    171        .octa     0xffffffffffffff0C0B0A090807060504
    172        .octa     0xffffffffffff0D0C0B0A090807060504
    173        .octa     0xffffffffff0E0D0C0B0A090807060504
    174        .octa     0xffffffff0F0E0D0C0B0A090807060504
    175        .octa     0xffffff0C0B0A09080706050403020100
    176        .octa     0xffff0D0C0B0A09080706050403020100
    177        .octa     0xff0E0D0C0B0A09080706050403020100
    178        .octa     0x0F0E0D0C0B0A09080706050403020100
    179
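# Informal note: aad_shift_arr is a table of 17 vpshufb masks (272 = 17*16
# bytes), indexed in CALC_AAD_HASH by (remaining AAD bytes) * 16 (the
# "salq $4, %r11" there).  Entry n keeps the n valid AAD bytes that were
# accumulated at the top of the register, moves them down to the low end,
# and zeroes everything else (selector bytes of 0xff clear the destination
# byte), which also discards any bytes that were over-read past the AAD.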
    180
    181.text
    182
    183
    184#define AadHash 16*0
    185#define AadLen 16*1
    186#define InLen (16*1)+8
    187#define PBlockEncKey 16*2
    188#define OrigIV 16*3
    189#define CurCount 16*4
    190#define PBlockLen 16*5
    191
    192HashKey        = 16*6   # store HashKey <<1 mod poly here
    193HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
    194HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
    195HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
    196HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
    197HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
    198HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
    199HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
    200HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
    201HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
    202HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
    203HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
    204HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
    205HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
    206HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
    207HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
    208
    209#define arg1 %rdi
    210#define arg2 %rsi
    211#define arg3 %rdx
    212#define arg4 %rcx
    213#define arg5 %r8
    214#define arg6 %r9
    215#define keysize 2*15*16(arg1)
    216
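# arg1..arg6 are simply the x86-64 SysV integer argument registers
# (%rdi, %rsi, %rdx, %rcx, %r8, %r9).  Throughout this file arg1 is the
# expanded AES key schedule (round keys at a 16-byte stride, hence the
# "16*i(arg1)" operands) and arg2 is the per-request GCM context laid out
# by the offsets above; what the remaining arguments mean depends on the
# entry point (init vs. update vs. finalize).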
    217i = 0
    218j = 0
    219
    220out_order = 0
    221in_order = 1
    222DEC = 0
    223ENC = 1
    224
    225.macro define_reg r n
    226reg_\r = %xmm\n
    227.endm
    228
    229.macro setreg
    230.altmacro
    231define_reg i %i
    232define_reg j %j
    233.noaltmacro
    234.endm
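# Illustrative expansion: define_reg/setreg rely on .altmacro so that %i
# and %j are replaced by the current values of the assembler symbols i and
# j.  For example,
#
#       i = 3
#       setreg                  # reg_i now expands to %xmm3
#       vmovdqa \CTR, reg_i     # writes %xmm3
#
# which is how the .rep loops below step through a range of xmm registers.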
    235
    236TMP1 =   16*0    # Temporary storage for AAD
    237TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
    238TMP3 =   16*2    # Temporary storage for AES State 3
    239TMP4 =   16*3    # Temporary storage for AES State 4
    240TMP5 =   16*4    # Temporary storage for AES State 5
    241TMP6 =   16*5    # Temporary storage for AES State 6
    242TMP7 =   16*6    # Temporary storage for AES State 7
    243TMP8 =   16*7    # Temporary storage for AES State 8
    244
    245VARIABLE_OFFSET = 16*8
    246
    247################################
    248# Utility Macros
    249################################
    250
    251.macro FUNC_SAVE
    252        push    %r12
    253        push    %r13
    254        push    %r15
    255
    256	push	%rbp
    257	mov	%rsp, %rbp
    258
    259        sub     $VARIABLE_OFFSET, %rsp
    260        and     $~63, %rsp                    # align rsp to 64 bytes
    261.endm
    262
    263.macro FUNC_RESTORE
    264        mov     %rbp, %rsp
    265	pop	%rbp
    266
    267        pop     %r15
    268        pop     %r13
    269        pop     %r12
    270.endm
    271
    272# Encryption of a single block
    273.macro ENCRYPT_SINGLE_BLOCK REP XMM0
    274                vpxor    (arg1), \XMM0, \XMM0
    275               i = 1
    276               setreg
    277.rep \REP
    278                vaesenc  16*i(arg1), \XMM0, \XMM0
    279               i = (i+1)
    280               setreg
    281.endr
    282                vaesenclast 16*i(arg1), \XMM0, \XMM0
    283.endm
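# Usage sketch: \REP is the number of middle AES rounds, so with \REP = 9
# this performs one AES-128 encryption of \XMM0 in place (whitening XOR,
# nine vaesenc rounds, one vaesenclast); the larger key sizes pass 11 and
# 13.  For example, GCM_COMPLETE below does
#
#       vmovdqu OrigIV(arg2), %xmm9
#       ENCRYPT_SINGLE_BLOCK \REP, %xmm9     # %xmm9 = E(K, Y0)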
    284
    285# combined for GCM encrypt and decrypt functions
    286# clobbering all xmm registers
    287# clobbering r10, r11, r12, r13, r15, rax
    288.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
    289        vmovdqu AadHash(arg2), %xmm8
    290        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
    291        add arg5, InLen(arg2)
    292
    293        # initialize the data pointer offset as zero
    294        xor     %r11d, %r11d
    295
    296        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
    297        sub %r11, arg5
    298
    299        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
    300        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
    301
    302        mov     %r13, %r12
    303        shr     $4, %r12
    304        and     $7, %r12
    305        jz      _initial_num_blocks_is_0\@
    306
    307        cmp     $7, %r12
    308        je      _initial_num_blocks_is_7\@
    309        cmp     $6, %r12
    310        je      _initial_num_blocks_is_6\@
    311        cmp     $5, %r12
    312        je      _initial_num_blocks_is_5\@
    313        cmp     $4, %r12
    314        je      _initial_num_blocks_is_4\@
    315        cmp     $3, %r12
    316        je      _initial_num_blocks_is_3\@
    317        cmp     $2, %r12
    318        je      _initial_num_blocks_is_2\@
    319
    320        jmp     _initial_num_blocks_is_1\@
    321
    322_initial_num_blocks_is_7\@:
    323        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
    324        sub     $16*7, %r13
    325        jmp     _initial_blocks_encrypted\@
    326
    327_initial_num_blocks_is_6\@:
    328        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
    329        sub     $16*6, %r13
    330        jmp     _initial_blocks_encrypted\@
    331
    332_initial_num_blocks_is_5\@:
    333        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
    334        sub     $16*5, %r13
    335        jmp     _initial_blocks_encrypted\@
    336
    337_initial_num_blocks_is_4\@:
    338        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
    339        sub     $16*4, %r13
    340        jmp     _initial_blocks_encrypted\@
    341
    342_initial_num_blocks_is_3\@:
    343        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
    344        sub     $16*3, %r13
    345        jmp     _initial_blocks_encrypted\@
    346
    347_initial_num_blocks_is_2\@:
    348        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
    349        sub     $16*2, %r13
    350        jmp     _initial_blocks_encrypted\@
    351
    352_initial_num_blocks_is_1\@:
    353        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
    354        sub     $16*1, %r13
    355        jmp     _initial_blocks_encrypted\@
    356
    357_initial_num_blocks_is_0\@:
    358        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
    359
    360
    361_initial_blocks_encrypted\@:
    362        test    %r13, %r13
    363        je      _zero_cipher_left\@
    364
    365        sub     $128, %r13
    366        je      _eight_cipher_left\@
    367
    368
    369
    370
    371        vmovd   %xmm9, %r15d
    372        and     $255, %r15d
    373        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
    374
    375
    376_encrypt_by_8_new\@:
    377        cmp     $(255-8), %r15d
    378        jg      _encrypt_by_8\@
    379
    380
    381
    382        add     $8, %r15b
    383        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
    384        add     $128, %r11
    385        sub     $128, %r13
    386        jne     _encrypt_by_8_new\@
    387
    388        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
    389        jmp     _eight_cipher_left\@
    390
    391_encrypt_by_8\@:
    392        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
    393        add     $8, %r15b
    394        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
    395        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
    396        add     $128, %r11
    397        sub     $128, %r13
    398        jne     _encrypt_by_8_new\@
    399
    400        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
    401
    402
    403
    404
    405_eight_cipher_left\@:
    406        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
    407
    408
    409_zero_cipher_left\@:
    410        vmovdqu %xmm14, AadHash(arg2)
    411        vmovdqu %xmm9, CurCount(arg2)
    412
    413        # check for 0 length
    414        mov     arg5, %r13
    415        and     $15, %r13                            # r13 = (arg5 mod 16)
    416
    417        je      _multiple_of_16_bytes\@
    418
    419        # handle the last <16 Byte block separately
    420
    421        mov %r13, PBlockLen(arg2)
    422
    423        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
    424        vmovdqu %xmm9, CurCount(arg2)
    425        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
    426
    427        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
    428        vmovdqu %xmm9, PBlockEncKey(arg2)
    429
    430        cmp $16, arg5
    431        jge _large_enough_update\@
    432
    433        lea (arg4,%r11,1), %r10
    434        mov %r13, %r12
    435
    436        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
    437
    438        lea     SHIFT_MASK+16(%rip), %r12
     439        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
     440                                                     # able to shift 16-r13 bytes (r13 is the
     441                                                     # number of bytes in plaintext mod 16)
    442
    443        jmp _final_ghash_mul\@
    444
    445_large_enough_update\@:
    446        sub $16, %r11
    447        add %r13, %r11
    448
    449        # receive the last <16 Byte block
    450        vmovdqu	(arg4, %r11, 1), %xmm1
    451
    452        sub	%r13, %r11
    453        add	$16, %r11
    454
    455        lea	SHIFT_MASK+16(%rip), %r12
    456        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
    457        # (r13 is the number of bytes in plaintext mod 16)
    458        sub	%r13, %r12
    459        # get the appropriate shuffle mask
    460        vmovdqu	(%r12), %xmm2
    461        # shift right 16-r13 bytes
    462        vpshufb  %xmm2, %xmm1, %xmm1
    463
    464_final_ghash_mul\@:
    465        .if  \ENC_DEC ==  DEC
    466        vmovdqa %xmm1, %xmm2
    467        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
    468        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
    469						     # mask out top 16-r13 bytes of xmm9
    470        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
    471        vpand   %xmm1, %xmm2, %xmm2
    472        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
    473        vpxor   %xmm2, %xmm14, %xmm14
    474
    475        vmovdqu %xmm14, AadHash(arg2)
    476        .else
    477        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
    478        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
    479						     # mask out top 16-r13 bytes of xmm9
    480        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
    481        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
    482        vpxor   %xmm9, %xmm14, %xmm14
    483
    484        vmovdqu %xmm14, AadHash(arg2)
    485        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
    486        .endif
    487
    488
    489        #############################
    490        # output r13 Bytes
    491        vmovq   %xmm9, %rax
    492        cmp     $8, %r13
    493        jle     _less_than_8_bytes_left\@
    494
    495        mov     %rax, (arg3 , %r11)
    496        add     $8, %r11
    497        vpsrldq $8, %xmm9, %xmm9
    498        vmovq   %xmm9, %rax
    499        sub     $8, %r13
    500
    501_less_than_8_bytes_left\@:
    502        movb    %al, (arg3 , %r11)
    503        add     $1, %r11
    504        shr     $8, %rax
    505        sub     $1, %r13
    506        jne     _less_than_8_bytes_left\@
    507        #############################
    508
    509_multiple_of_16_bytes\@:
    510.endm
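# Rough flow of GCM_ENC_DEC: finish any partial block carried over from a
# previous update call (PARTIAL_BLOCK), encrypt and GHASH 0-7 initial
# blocks so that the remaining full-block length is a multiple of 128
# bytes, run the eight-blocks-at-a-time loop (GHASH_8_ENCRYPT_8_PARALLEL),
# fold the last eight ciphertext blocks into the hash (GHASH_LAST_8), and
# finally encrypt a trailing <16-byte block, whose state is saved in
# PBlockLen/PBlockEncKey so the next update call can complete it.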
    511
    512
     513# GCM_COMPLETE: finishes the tag update for the last partial block
     514# Output: Authentication Tag (AUTH_TAG)
    515# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
    516.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
    517        vmovdqu AadHash(arg2), %xmm14
    518        vmovdqu HashKey(arg2), %xmm13
    519
    520        mov PBlockLen(arg2), %r12
    521        test %r12, %r12
    522        je _partial_done\@
    523
    524	#GHASH computation for the last <16 Byte block
    525        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
    526
    527_partial_done\@:
    528        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
    529        shl     $3, %r12                             # convert into number of bits
    530        vmovd   %r12d, %xmm15                        # len(A) in xmm15
    531
    532        mov InLen(arg2), %r12
     533        shl     $3, %r12                        # len(C) in bits (*8)
    534        vmovq   %r12, %xmm1
    535        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
    536        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
    537
    538        vpxor   %xmm15, %xmm14, %xmm14
    539        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
    540        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
    541
    542        vmovdqu OrigIV(arg2), %xmm9
    543
    544        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
    545
    546        vpxor   %xmm14, %xmm9, %xmm9
    547
    548
    549
    550_return_T\@:
    551        mov     \AUTH_TAG, %r10              # r10 = authTag
    552        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
    553
    554        cmp     $16, %r11
    555        je      _T_16\@
    556
    557        cmp     $8, %r11
    558        jl      _T_4\@
    559
    560_T_8\@:
    561        vmovq   %xmm9, %rax
    562        mov     %rax, (%r10)
    563        add     $8, %r10
    564        sub     $8, %r11
    565        vpsrldq $8, %xmm9, %xmm9
    566        test    %r11, %r11
    567        je     _return_T_done\@
    568_T_4\@:
    569        vmovd   %xmm9, %eax
    570        mov     %eax, (%r10)
    571        add     $4, %r10
    572        sub     $4, %r11
    573        vpsrldq     $4, %xmm9, %xmm9
    574        test    %r11, %r11
    575        je     _return_T_done\@
    576_T_123\@:
    577        vmovd     %xmm9, %eax
    578        cmp     $2, %r11
    579        jl     _T_1\@
    580        mov     %ax, (%r10)
    581        cmp     $2, %r11
    582        je     _return_T_done\@
    583        add     $2, %r10
    584        sar     $16, %eax
    585_T_1\@:
    586        mov     %al, (%r10)
    587        jmp     _return_T_done\@
    588
    589_T_16\@:
    590        vmovdqu %xmm9, (%r10)
    591
    592_return_T_done\@:
    593.endm
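# In GCM terms, the macro above finishes
#
#       S = GHASH_H( AAD || C || len(AAD)_64 || len(C)_64 )   (zero-padded)
#       T = E(K, Y0) XOR S
#
# and _return_T writes the first auth_tag_len bytes of T to AUTH_TAG
# (8, 12 or 16 per the header comment, though smaller sizes are handled
# as well).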
    594
    595.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
    596
    597	mov     \AAD, %r10                      # r10 = AAD
    598	mov     \AADLEN, %r12                      # r12 = aadLen
    599
    600
    601	mov     %r12, %r11
    602
    603	vpxor   \T8, \T8, \T8
    604	vpxor   \T7, \T7, \T7
    605	cmp     $16, %r11
    606	jl      _get_AAD_rest8\@
    607_get_AAD_blocks\@:
    608	vmovdqu (%r10), \T7
    609	vpshufb SHUF_MASK(%rip), \T7, \T7
    610	vpxor   \T7, \T8, \T8
    611	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
    612	add     $16, %r10
    613	sub     $16, %r12
    614	sub     $16, %r11
    615	cmp     $16, %r11
    616	jge     _get_AAD_blocks\@
    617	vmovdqu \T8, \T7
    618	test    %r11, %r11
    619	je      _get_AAD_done\@
    620
    621	vpxor   \T7, \T7, \T7
    622
    623	/* read the last <16B of AAD. since we have at least 4B of
    624	data right after the AAD (the ICV, and maybe some CT), we can
    625	read 4B/8B blocks safely, and then get rid of the extra stuff */
    626_get_AAD_rest8\@:
    627	cmp     $4, %r11
    628	jle     _get_AAD_rest4\@
    629	movq    (%r10), \T1
    630	add     $8, %r10
    631	sub     $8, %r11
    632	vpslldq $8, \T1, \T1
    633	vpsrldq $8, \T7, \T7
    634	vpxor   \T1, \T7, \T7
    635	jmp     _get_AAD_rest8\@
    636_get_AAD_rest4\@:
    637	test    %r11, %r11
    638	jle      _get_AAD_rest0\@
    639	mov     (%r10), %eax
    640	movq    %rax, \T1
    641	add     $4, %r10
    642	sub     $4, %r11
    643	vpslldq $12, \T1, \T1
    644	vpsrldq $4, \T7, \T7
    645	vpxor   \T1, \T7, \T7
    646_get_AAD_rest0\@:
    647	/* finalize: shift out the extra bytes we read, and align
    648	left. since pslldq can only shift by an immediate, we use
    649	vpshufb and an array of shuffle masks */
    650	movq    %r12, %r11
    651	salq    $4, %r11
    652	vmovdqu  aad_shift_arr(%r11), \T1
    653	vpshufb \T1, \T7, \T7
    654_get_AAD_rest_final\@:
    655	vpshufb SHUF_MASK(%rip), \T7, \T7
    656	vpxor   \T8, \T7, \T7
    657	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
    658
    659_get_AAD_done\@:
    660        vmovdqu \T7, AadHash(arg2)
    661.endm
    662
    663.macro INIT GHASH_MUL PRECOMPUTE
    664        mov arg6, %r11
    665        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
    666        xor %r11d, %r11d
    667        mov %r11, InLen(arg2) # ctx_data.in_length = 0
    668
    669        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
    670        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
    671        mov arg3, %rax
    672        movdqu (%rax), %xmm0
    673        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
    674
    675        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
    676        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
    677
    678        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
    679
    680        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
    681        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
    682        vmovdqa  %xmm6, %xmm2
    683        vpsllq   $1, %xmm6, %xmm6
    684        vpsrlq   $63, %xmm2, %xmm2
    685        vmovdqa  %xmm2, %xmm1
    686        vpslldq  $8, %xmm2, %xmm2
    687        vpsrldq  $8, %xmm1, %xmm1
    688        vpor     %xmm2, %xmm6, %xmm6
    689        #reduction
    690        vpshufd  $0b00100100, %xmm1, %xmm2
    691        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
    692        vpand    POLY(%rip), %xmm2, %xmm2
    693        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
    694        #######################################################################
    695        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
    696
    697        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
    698
    699        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
    700.endm
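# The HashKey<<1 mod poly computation above is a conditional doubling in
# GF(2^128): xmm6 is shifted left by one bit (the vpsrlq/vpslldq/vpor
# sequence carries the bit across the two 64-bit halves), and if the bit
# shifted out of position 127 was set, POLY is XORed in; the
# vpshufd/vpcmpeqd-TWOONE/vpand sequence builds that conditional mask
# without a branch.  Feeding H<<1 to GHASH_MUL (which returns A*B*x) is
# what makes the result come out as GH*HashKey mod poly, as its header
# comment notes.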
    701
    702
    703# Reads DLEN bytes starting at DPTR and stores in XMMDst
    704# where 0 < DLEN < 16
    705# Clobbers %rax, DLEN
    706.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
    707        vpxor \XMMDst, \XMMDst, \XMMDst
    708
    709        cmp $8, \DLEN
    710        jl _read_lt8_\@
    711        mov (\DPTR), %rax
    712        vpinsrq $0, %rax, \XMMDst, \XMMDst
    713        sub $8, \DLEN
    714        jz _done_read_partial_block_\@
    715        xor %eax, %eax
    716_read_next_byte_\@:
    717        shl $8, %rax
    718        mov 7(\DPTR, \DLEN, 1), %al
    719        dec \DLEN
    720        jnz _read_next_byte_\@
    721        vpinsrq $1, %rax, \XMMDst, \XMMDst
    722        jmp _done_read_partial_block_\@
    723_read_lt8_\@:
    724        xor %eax, %eax
    725_read_next_byte_lt8_\@:
    726        shl $8, %rax
    727        mov -1(\DPTR, \DLEN, 1), %al
    728        dec \DLEN
    729        jnz _read_next_byte_lt8_\@
    730        vpinsrq $0, %rax, \XMMDst, \XMMDst
    731_done_read_partial_block_\@:
    732.endm
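# Worked example: with \DLEN = 12, the mov/vpinsrq $0 pair loads bytes 0-7,
# then _read_next_byte_ shifts bytes 11, 10, 9, 8 into %rax so that after
# vpinsrq $1 bytes 8-11 of \XMMDst hold the input in natural order and
# bytes 12-15 stay zero.  The <8 path accumulates into the low quadword the
# same way.  In both cases the highest address touched is DPTR[DLEN-1], so
# nothing is read past the partial block.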
    733
     734# PARTIAL_BLOCK: handles encryption/decryption and the tag for partial blocks
     735# carried over between update calls.
     736# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
    737# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
    738# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
    739.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
    740        AAD_HASH ENC_DEC
    741        mov 	PBlockLen(arg2), %r13
    742        test	%r13, %r13
    743        je	_partial_block_done_\@	# Leave Macro if no partial blocks
    744        # Read in input data without over reading
    745        cmp	$16, \PLAIN_CYPH_LEN
    746        jl	_fewer_than_16_bytes_\@
    747        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
    748        jmp	_data_read_\@
    749
    750_fewer_than_16_bytes_\@:
    751        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
    752        mov	\PLAIN_CYPH_LEN, %r12
    753        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
    754
    755        mov PBlockLen(arg2), %r13
    756
    757_data_read_\@:				# Finished reading in data
    758
    759        vmovdqu	PBlockEncKey(arg2), %xmm9
    760        vmovdqu	HashKey(arg2), %xmm13
    761
    762        lea	SHIFT_MASK(%rip), %r12
    763
    764        # adjust the shuffle mask pointer to be able to shift r13 bytes
     765        # (16-r13 is the number of bytes in plaintext mod 16)
    766        add	%r13, %r12
    767        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
    768        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
    769
    770.if  \ENC_DEC ==  DEC
    771        vmovdqa	%xmm1, %xmm3
    772        pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
    773
    774        mov	\PLAIN_CYPH_LEN, %r10
    775        add	%r13, %r10
    776        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
    777        sub	$16, %r10
     778        # Determine if the partial block is not being filled and
    779        # shift mask accordingly
    780        jge	_no_extra_mask_1_\@
    781        sub	%r10, %r12
    782_no_extra_mask_1_\@:
    783
    784        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
    785        # get the appropriate mask to mask out bottom r13 bytes of xmm9
    786        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
    787
    788        vpand	%xmm1, %xmm3, %xmm3
    789        vmovdqa	SHUF_MASK(%rip), %xmm10
    790        vpshufb	%xmm10, %xmm3, %xmm3
    791        vpshufb	%xmm2, %xmm3, %xmm3
    792        vpxor	%xmm3, \AAD_HASH, \AAD_HASH
    793
    794        test	%r10, %r10
    795        jl	_partial_incomplete_1_\@
    796
    797        # GHASH computation for the last <16 Byte block
    798        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
    799        xor	%eax,%eax
    800
    801        mov	%rax, PBlockLen(arg2)
    802        jmp	_dec_done_\@
    803_partial_incomplete_1_\@:
    804        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
    805_dec_done_\@:
    806        vmovdqu	\AAD_HASH, AadHash(arg2)
    807.else
    808        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
    809
    810        mov	\PLAIN_CYPH_LEN, %r10
    811        add	%r13, %r10
    812        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
    813        sub	$16, %r10
     814        # Determine if the partial block is not being filled and
    815        # shift mask accordingly
    816        jge	_no_extra_mask_2_\@
    817        sub	%r10, %r12
    818_no_extra_mask_2_\@:
    819
    820        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
    821        # get the appropriate mask to mask out bottom r13 bytes of xmm9
    822        vpand	%xmm1, %xmm9, %xmm9
    823
    824        vmovdqa	SHUF_MASK(%rip), %xmm1
    825        vpshufb %xmm1, %xmm9, %xmm9
    826        vpshufb %xmm2, %xmm9, %xmm9
    827        vpxor	%xmm9, \AAD_HASH, \AAD_HASH
    828
    829        test	%r10, %r10
    830        jl	_partial_incomplete_2_\@
    831
    832        # GHASH computation for the last <16 Byte block
    833        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
    834        xor	%eax,%eax
    835
    836        mov	%rax, PBlockLen(arg2)
    837        jmp	_encode_done_\@
    838_partial_incomplete_2_\@:
    839        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
    840_encode_done_\@:
    841        vmovdqu	\AAD_HASH, AadHash(arg2)
    842
    843        vmovdqa	SHUF_MASK(%rip), %xmm10
    844        # shuffle xmm9 back to output as ciphertext
    845        vpshufb	%xmm10, %xmm9, %xmm9
    846        vpshufb	%xmm2, %xmm9, %xmm9
    847.endif
    848        # output encrypted Bytes
    849        test	%r10, %r10
    850        jl	_partial_fill_\@
    851        mov	%r13, %r12
    852        mov	$16, %r13
    853        # Set r13 to be the number of bytes to write out
    854        sub	%r12, %r13
    855        jmp	_count_set_\@
    856_partial_fill_\@:
    857        mov	\PLAIN_CYPH_LEN, %r13
    858_count_set_\@:
    859        vmovdqa	%xmm9, %xmm0
    860        vmovq	%xmm0, %rax
    861        cmp	$8, %r13
    862        jle	_less_than_8_bytes_left_\@
    863
    864        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
    865        add	$8, \DATA_OFFSET
    866        psrldq	$8, %xmm0
    867        vmovq	%xmm0, %rax
    868        sub	$8, %r13
    869_less_than_8_bytes_left_\@:
    870        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
    871        add	$1, \DATA_OFFSET
    872        shr	$8, %rax
    873        sub	$1, %r13
    874        jne	_less_than_8_bytes_left_\@
    875_partial_block_done_\@:
    876.endm # PARTIAL_BLOCK
    877
    878###############################################################################
    879# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
    880# Input: A and B (128-bits each, bit-reflected)
    881# Output: C = A*B*x mod poly, (i.e. >>1 )
    882# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
    883# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
    884###############################################################################
    885.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
    886
    887        vpshufd         $0b01001110, \GH, \T2
    888        vpshufd         $0b01001110, \HK, \T3
    889        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
    890        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
    891
    892        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
    893        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
    894        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
    895        vpxor           \GH, \T2,\T2
    896        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
    897
    898        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
    899        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
    900        vpxor           \T3, \GH, \GH
    901        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
    902
    903        #first phase of the reduction
    904        vpslld  $31, \GH, \T2                   # packed right shifting << 31
     905        vpslld  $30, \GH, \T3                   # packed right shifting << 30
     906        vpslld  $25, \GH, \T4                   # packed right shifting << 25
    907
    908        vpxor   \T3, \T2, \T2                   # xor the shifted versions
    909        vpxor   \T4, \T2, \T2
    910
    911        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
    912
    913        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
    914        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
    915
    916        #second phase of the reduction
    917
    918        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
    919        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
    920        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
    921        vpxor   \T3, \T2, \T2                   # xor the shifted versions
    922        vpxor   \T4, \T2, \T2
    923
    924        vpxor   \T5, \T2, \T2
    925        vpxor   \T2, \GH, \GH
    926        vpxor   \T1, \GH, \GH                   # the result is in GH
    927
    928
    929.endm
    930
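# The multiply above is one Karatsuba step over GF(2)[x]: writing
#
#       GH = a1*x^64 + a0,   HK = b1*x^64 + b0
#
# the 256-bit product is a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0, and
# the middle term is computed as (a1+a0)*(b1+b0) + a1*b1 + a0*b0 (all
# additions are XORs), which is exactly what T2 holds; three vpclmulqdq
# instead of four.  The two reduction phases then fold the 256-bit result
# back to 128 bits modulo x^128 + x^127 + x^126 + x^121 + 1 using only
# shifts and XORs.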
    931.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
    932
     933        # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
    934        vmovdqa  \HK, \T5
    935
    936        vpshufd  $0b01001110, \T5, \T1
    937        vpxor    \T5, \T1, \T1
    938        vmovdqu  \T1, HashKey_k(arg2)
    939
    940        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
    941        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
    942        vpshufd  $0b01001110, \T5, \T1
    943        vpxor    \T5, \T1, \T1
    944        vmovdqu  \T1, HashKey_2_k(arg2)
    945
    946        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
    947        vmovdqu  \T5, HashKey_3(arg2)
    948        vpshufd  $0b01001110, \T5, \T1
    949        vpxor    \T5, \T1, \T1
    950        vmovdqu  \T1, HashKey_3_k(arg2)
    951
    952        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
    953        vmovdqu  \T5, HashKey_4(arg2)
    954        vpshufd  $0b01001110, \T5, \T1
    955        vpxor    \T5, \T1, \T1
    956        vmovdqu  \T1, HashKey_4_k(arg2)
    957
    958        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
    959        vmovdqu  \T5, HashKey_5(arg2)
    960        vpshufd  $0b01001110, \T5, \T1
    961        vpxor    \T5, \T1, \T1
    962        vmovdqu  \T1, HashKey_5_k(arg2)
    963
    964        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
    965        vmovdqu  \T5, HashKey_6(arg2)
    966        vpshufd  $0b01001110, \T5, \T1
    967        vpxor    \T5, \T1, \T1
    968        vmovdqu  \T1, HashKey_6_k(arg2)
    969
    970        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
    971        vmovdqu  \T5, HashKey_7(arg2)
    972        vpshufd  $0b01001110, \T5, \T1
    973        vpxor    \T5, \T1, \T1
    974        vmovdqu  \T1, HashKey_7_k(arg2)
    975
    976        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
    977        vmovdqu  \T5, HashKey_8(arg2)
    978        vpshufd  $0b01001110, \T5, \T1
    979        vpxor    \T5, \T1, \T1
    980        vmovdqu  \T1, HashKey_8_k(arg2)
    981
    982.endm
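# Why powers of the hash key are precomputed: the GHASH update over eight
# blocks
#
#       Y' = (...((Y + X1)*H + X2)*H + ... + X8)*H
#
# can be rewritten as (Y + X1)*H^8 + X2*H^7 + ... + X8*H, so the eight
# carry-less multiplies become independent and can be interleaved with the
# eight AES pipelines in GHASH_8_ENCRYPT_8_PARALLEL_AVX below.  The
# HashKey_i_k values cache the XOR of the high and low halves of each
# power, saving a vpshufd/vpxor on the key operand of each multiply.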
    983
    984## if a = number of total plaintext bytes
    985## b = floor(a/16)
     986## num_initial_blocks = b mod 8
    987## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
    988## r10, r11, r12, rax are clobbered
    989## arg1, arg2, arg3, arg4 are used as pointers only, not modified
    990
    991.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
    992	i = (8-\num_initial_blocks)
    993	setreg
    994        vmovdqu AadHash(arg2), reg_i
    995
    996	# start AES for num_initial_blocks blocks
    997	vmovdqu CurCount(arg2), \CTR
    998
    999	i = (9-\num_initial_blocks)
   1000	setreg
   1001.rep \num_initial_blocks
   1002                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
   1003                vmovdqa \CTR, reg_i
   1004                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
   1005	i = (i+1)
   1006	setreg
   1007.endr
   1008
   1009	vmovdqa  (arg1), \T_key
   1010	i = (9-\num_initial_blocks)
   1011	setreg
   1012.rep \num_initial_blocks
   1013                vpxor   \T_key, reg_i, reg_i
   1014	i = (i+1)
   1015	setreg
   1016.endr
   1017
   1018       j = 1
   1019       setreg
   1020.rep \REP
   1021       vmovdqa  16*j(arg1), \T_key
   1022	i = (9-\num_initial_blocks)
   1023	setreg
   1024.rep \num_initial_blocks
   1025        vaesenc \T_key, reg_i, reg_i
   1026	i = (i+1)
   1027	setreg
   1028.endr
   1029
   1030       j = (j+1)
   1031       setreg
   1032.endr
   1033
   1034	vmovdqa  16*j(arg1), \T_key
   1035	i = (9-\num_initial_blocks)
   1036	setreg
   1037.rep \num_initial_blocks
   1038        vaesenclast      \T_key, reg_i, reg_i
   1039	i = (i+1)
   1040	setreg
   1041.endr
   1042
   1043	i = (9-\num_initial_blocks)
   1044	setreg
   1045.rep \num_initial_blocks
   1046                vmovdqu (arg4, %r11), \T1
   1047                vpxor   \T1, reg_i, reg_i
   1048                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
   1049                add     $16, %r11
   1050.if  \ENC_DEC == DEC
   1051                vmovdqa \T1, reg_i
   1052.endif
   1053                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
   1054	i = (i+1)
   1055	setreg
   1056.endr
   1057
   1058
   1059	i = (8-\num_initial_blocks)
   1060	j = (9-\num_initial_blocks)
   1061	setreg
   1062
   1063.rep \num_initial_blocks
   1064        vpxor    reg_i, reg_j, reg_j
   1065        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
   1066	i = (i+1)
   1067	j = (j+1)
   1068	setreg
   1069.endr
   1070        # XMM8 has the combined result here
   1071
   1072        vmovdqa  \XMM8, TMP1(%rsp)
   1073        vmovdqa  \XMM8, \T3
   1074
   1075        cmp     $128, %r13
   1076        jl      _initial_blocks_done\@                  # no need for precomputed constants
   1077
   1078###############################################################################
    1079# HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
   1080                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   1081                vmovdqa  \CTR, \XMM1
   1082                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
   1083
   1084                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   1085                vmovdqa  \CTR, \XMM2
   1086                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
   1087
   1088                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   1089                vmovdqa  \CTR, \XMM3
   1090                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
   1091
   1092                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   1093                vmovdqa  \CTR, \XMM4
   1094                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
   1095
   1096                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   1097                vmovdqa  \CTR, \XMM5
   1098                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
   1099
   1100                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   1101                vmovdqa  \CTR, \XMM6
   1102                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
   1103
   1104                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   1105                vmovdqa  \CTR, \XMM7
   1106                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
   1107
   1108                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   1109                vmovdqa  \CTR, \XMM8
   1110                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
   1111
   1112                vmovdqa  (arg1), \T_key
   1113                vpxor    \T_key, \XMM1, \XMM1
   1114                vpxor    \T_key, \XMM2, \XMM2
   1115                vpxor    \T_key, \XMM3, \XMM3
   1116                vpxor    \T_key, \XMM4, \XMM4
   1117                vpxor    \T_key, \XMM5, \XMM5
   1118                vpxor    \T_key, \XMM6, \XMM6
   1119                vpxor    \T_key, \XMM7, \XMM7
   1120                vpxor    \T_key, \XMM8, \XMM8
   1121
   1122               i = 1
   1123               setreg
   1124.rep    \REP       # do REP rounds
   1125                vmovdqa  16*i(arg1), \T_key
   1126                vaesenc  \T_key, \XMM1, \XMM1
   1127                vaesenc  \T_key, \XMM2, \XMM2
   1128                vaesenc  \T_key, \XMM3, \XMM3
   1129                vaesenc  \T_key, \XMM4, \XMM4
   1130                vaesenc  \T_key, \XMM5, \XMM5
   1131                vaesenc  \T_key, \XMM6, \XMM6
   1132                vaesenc  \T_key, \XMM7, \XMM7
   1133                vaesenc  \T_key, \XMM8, \XMM8
   1134               i = (i+1)
   1135               setreg
   1136.endr
   1137
   1138                vmovdqa  16*i(arg1), \T_key
   1139                vaesenclast  \T_key, \XMM1, \XMM1
   1140                vaesenclast  \T_key, \XMM2, \XMM2
   1141                vaesenclast  \T_key, \XMM3, \XMM3
   1142                vaesenclast  \T_key, \XMM4, \XMM4
   1143                vaesenclast  \T_key, \XMM5, \XMM5
   1144                vaesenclast  \T_key, \XMM6, \XMM6
   1145                vaesenclast  \T_key, \XMM7, \XMM7
   1146                vaesenclast  \T_key, \XMM8, \XMM8
   1147
   1148                vmovdqu  (arg4, %r11), \T1
   1149                vpxor    \T1, \XMM1, \XMM1
   1150                vmovdqu  \XMM1, (arg3 , %r11)
   1151                .if   \ENC_DEC == DEC
   1152                vmovdqa  \T1, \XMM1
   1153                .endif
   1154
   1155                vmovdqu  16*1(arg4, %r11), \T1
   1156                vpxor    \T1, \XMM2, \XMM2
   1157                vmovdqu  \XMM2, 16*1(arg3 , %r11)
   1158                .if   \ENC_DEC == DEC
   1159                vmovdqa  \T1, \XMM2
   1160                .endif
   1161
   1162                vmovdqu  16*2(arg4, %r11), \T1
   1163                vpxor    \T1, \XMM3, \XMM3
   1164                vmovdqu  \XMM3, 16*2(arg3 , %r11)
   1165                .if   \ENC_DEC == DEC
   1166                vmovdqa  \T1, \XMM3
   1167                .endif
   1168
   1169                vmovdqu  16*3(arg4, %r11), \T1
   1170                vpxor    \T1, \XMM4, \XMM4
   1171                vmovdqu  \XMM4, 16*3(arg3 , %r11)
   1172                .if   \ENC_DEC == DEC
   1173                vmovdqa  \T1, \XMM4
   1174                .endif
   1175
   1176                vmovdqu  16*4(arg4, %r11), \T1
   1177                vpxor    \T1, \XMM5, \XMM5
   1178                vmovdqu  \XMM5, 16*4(arg3 , %r11)
   1179                .if   \ENC_DEC == DEC
   1180                vmovdqa  \T1, \XMM5
   1181                .endif
   1182
   1183                vmovdqu  16*5(arg4, %r11), \T1
   1184                vpxor    \T1, \XMM6, \XMM6
   1185                vmovdqu  \XMM6, 16*5(arg3 , %r11)
   1186                .if   \ENC_DEC == DEC
   1187                vmovdqa  \T1, \XMM6
   1188                .endif
   1189
   1190                vmovdqu  16*6(arg4, %r11), \T1
   1191                vpxor    \T1, \XMM7, \XMM7
   1192                vmovdqu  \XMM7, 16*6(arg3 , %r11)
   1193                .if   \ENC_DEC == DEC
   1194                vmovdqa  \T1, \XMM7
   1195                .endif
   1196
   1197                vmovdqu  16*7(arg4, %r11), \T1
   1198                vpxor    \T1, \XMM8, \XMM8
   1199                vmovdqu  \XMM8, 16*7(arg3 , %r11)
   1200                .if   \ENC_DEC == DEC
   1201                vmovdqa  \T1, \XMM8
   1202                .endif
   1203
   1204                add     $128, %r11
   1205
   1206                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
   1207                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
   1208                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
   1209                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
   1210                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
   1211                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
   1212                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
   1213                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
   1214                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
   1215
   1216###############################################################################
   1217
   1218_initial_blocks_done\@:
   1219
   1220.endm
   1221
   1222# encrypt 8 blocks at a time
   1223# ghash the 8 previously encrypted ciphertext blocks
   1224# arg1, arg2, arg3, arg4 are used as pointers only, not modified
   1225# r11 is the data offset value
   1226.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
   1227
   1228        vmovdqa \XMM1, \T2
   1229        vmovdqa \XMM2, TMP2(%rsp)
   1230        vmovdqa \XMM3, TMP3(%rsp)
   1231        vmovdqa \XMM4, TMP4(%rsp)
   1232        vmovdqa \XMM5, TMP5(%rsp)
   1233        vmovdqa \XMM6, TMP6(%rsp)
   1234        vmovdqa \XMM7, TMP7(%rsp)
   1235        vmovdqa \XMM8, TMP8(%rsp)
   1236
   1237.if \loop_idx == in_order
   1238                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
   1239                vpaddd  ONE(%rip), \XMM1, \XMM2
   1240                vpaddd  ONE(%rip), \XMM2, \XMM3
   1241                vpaddd  ONE(%rip), \XMM3, \XMM4
   1242                vpaddd  ONE(%rip), \XMM4, \XMM5
   1243                vpaddd  ONE(%rip), \XMM5, \XMM6
   1244                vpaddd  ONE(%rip), \XMM6, \XMM7
   1245                vpaddd  ONE(%rip), \XMM7, \XMM8
   1246                vmovdqa \XMM8, \CTR
   1247
   1248                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
   1249                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
   1250                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
   1251                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
   1252                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
   1253                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
   1254                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
   1255                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
   1256.else
   1257                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
   1258                vpaddd  ONEf(%rip), \XMM1, \XMM2
   1259                vpaddd  ONEf(%rip), \XMM2, \XMM3
   1260                vpaddd  ONEf(%rip), \XMM3, \XMM4
   1261                vpaddd  ONEf(%rip), \XMM4, \XMM5
   1262                vpaddd  ONEf(%rip), \XMM5, \XMM6
   1263                vpaddd  ONEf(%rip), \XMM6, \XMM7
   1264                vpaddd  ONEf(%rip), \XMM7, \XMM8
   1265                vmovdqa \XMM8, \CTR
   1266.endif
   1267
   1268
   1269        #######################################################################
   1270
   1271                vmovdqu (arg1), \T1
   1272                vpxor   \T1, \XMM1, \XMM1
   1273                vpxor   \T1, \XMM2, \XMM2
   1274                vpxor   \T1, \XMM3, \XMM3
   1275                vpxor   \T1, \XMM4, \XMM4
   1276                vpxor   \T1, \XMM5, \XMM5
   1277                vpxor   \T1, \XMM6, \XMM6
   1278                vpxor   \T1, \XMM7, \XMM7
   1279                vpxor   \T1, \XMM8, \XMM8
   1280
   1281        #######################################################################
   1282
   1283
   1284
   1285
   1286
   1287                vmovdqu 16*1(arg1), \T1
   1288                vaesenc \T1, \XMM1, \XMM1
   1289                vaesenc \T1, \XMM2, \XMM2
   1290                vaesenc \T1, \XMM3, \XMM3
   1291                vaesenc \T1, \XMM4, \XMM4
   1292                vaesenc \T1, \XMM5, \XMM5
   1293                vaesenc \T1, \XMM6, \XMM6
   1294                vaesenc \T1, \XMM7, \XMM7
   1295                vaesenc \T1, \XMM8, \XMM8
   1296
   1297                vmovdqu 16*2(arg1), \T1
   1298                vaesenc \T1, \XMM1, \XMM1
   1299                vaesenc \T1, \XMM2, \XMM2
   1300                vaesenc \T1, \XMM3, \XMM3
   1301                vaesenc \T1, \XMM4, \XMM4
   1302                vaesenc \T1, \XMM5, \XMM5
   1303                vaesenc \T1, \XMM6, \XMM6
   1304                vaesenc \T1, \XMM7, \XMM7
   1305                vaesenc \T1, \XMM8, \XMM8
   1306
   1307
   1308        #######################################################################
   1309
   1310        vmovdqu         HashKey_8(arg2), \T5
   1311        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
   1312        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
   1313
   1314        vpshufd         $0b01001110, \T2, \T6
   1315        vpxor           \T2, \T6, \T6
   1316
   1317        vmovdqu         HashKey_8_k(arg2), \T5
   1318        vpclmulqdq      $0x00, \T5, \T6, \T6
   1319
   1320                vmovdqu 16*3(arg1), \T1
   1321                vaesenc \T1, \XMM1, \XMM1
   1322                vaesenc \T1, \XMM2, \XMM2
   1323                vaesenc \T1, \XMM3, \XMM3
   1324                vaesenc \T1, \XMM4, \XMM4
   1325                vaesenc \T1, \XMM5, \XMM5
   1326                vaesenc \T1, \XMM6, \XMM6
   1327                vaesenc \T1, \XMM7, \XMM7
   1328                vaesenc \T1, \XMM8, \XMM8
   1329
   1330        vmovdqa         TMP2(%rsp), \T1
   1331        vmovdqu         HashKey_7(arg2), \T5
   1332        vpclmulqdq      $0x11, \T5, \T1, \T3
   1333        vpxor           \T3, \T4, \T4
   1334        vpclmulqdq      $0x00, \T5, \T1, \T3
   1335        vpxor           \T3, \T7, \T7
   1336
   1337        vpshufd         $0b01001110, \T1, \T3
   1338        vpxor           \T1, \T3, \T3
   1339        vmovdqu         HashKey_7_k(arg2), \T5
   1340        vpclmulqdq      $0x10, \T5, \T3, \T3
   1341        vpxor           \T3, \T6, \T6
   1342
   1343                vmovdqu 16*4(arg1), \T1
   1344                vaesenc \T1, \XMM1, \XMM1
   1345                vaesenc \T1, \XMM2, \XMM2
   1346                vaesenc \T1, \XMM3, \XMM3
   1347                vaesenc \T1, \XMM4, \XMM4
   1348                vaesenc \T1, \XMM5, \XMM5
   1349                vaesenc \T1, \XMM6, \XMM6
   1350                vaesenc \T1, \XMM7, \XMM7
   1351                vaesenc \T1, \XMM8, \XMM8
   1352
   1353        #######################################################################
   1354
   1355        vmovdqa         TMP3(%rsp), \T1
   1356        vmovdqu         HashKey_6(arg2), \T5
   1357        vpclmulqdq      $0x11, \T5, \T1, \T3
   1358        vpxor           \T3, \T4, \T4
   1359        vpclmulqdq      $0x00, \T5, \T1, \T3
   1360        vpxor           \T3, \T7, \T7
   1361
   1362        vpshufd         $0b01001110, \T1, \T3
   1363        vpxor           \T1, \T3, \T3
   1364        vmovdqu         HashKey_6_k(arg2), \T5
   1365        vpclmulqdq      $0x10, \T5, \T3, \T3
   1366        vpxor           \T3, \T6, \T6
   1367
   1368                vmovdqu 16*5(arg1), \T1
   1369                vaesenc \T1, \XMM1, \XMM1
   1370                vaesenc \T1, \XMM2, \XMM2
   1371                vaesenc \T1, \XMM3, \XMM3
   1372                vaesenc \T1, \XMM4, \XMM4
   1373                vaesenc \T1, \XMM5, \XMM5
   1374                vaesenc \T1, \XMM6, \XMM6
   1375                vaesenc \T1, \XMM7, \XMM7
   1376                vaesenc \T1, \XMM8, \XMM8
   1377
   1378        vmovdqa         TMP4(%rsp), \T1
   1379        vmovdqu         HashKey_5(arg2), \T5
   1380        vpclmulqdq      $0x11, \T5, \T1, \T3
   1381        vpxor           \T3, \T4, \T4
   1382        vpclmulqdq      $0x00, \T5, \T1, \T3
   1383        vpxor           \T3, \T7, \T7
   1384
   1385        vpshufd         $0b01001110, \T1, \T3
   1386        vpxor           \T1, \T3, \T3
   1387        vmovdqu         HashKey_5_k(arg2), \T5
   1388        vpclmulqdq      $0x10, \T5, \T3, \T3
   1389        vpxor           \T3, \T6, \T6
   1390
   1391                vmovdqu 16*6(arg1), \T1
   1392                vaesenc \T1, \XMM1, \XMM1
   1393                vaesenc \T1, \XMM2, \XMM2
   1394                vaesenc \T1, \XMM3, \XMM3
   1395                vaesenc \T1, \XMM4, \XMM4
   1396                vaesenc \T1, \XMM5, \XMM5
   1397                vaesenc \T1, \XMM6, \XMM6
   1398                vaesenc \T1, \XMM7, \XMM7
   1399                vaesenc \T1, \XMM8, \XMM8
   1400
   1401
   1402        vmovdqa         TMP5(%rsp), \T1
   1403        vmovdqu         HashKey_4(arg2), \T5
   1404        vpclmulqdq      $0x11, \T5, \T1, \T3
   1405        vpxor           \T3, \T4, \T4
   1406        vpclmulqdq      $0x00, \T5, \T1, \T3
   1407        vpxor           \T3, \T7, \T7
   1408
   1409        vpshufd         $0b01001110, \T1, \T3
   1410        vpxor           \T1, \T3, \T3
   1411        vmovdqu         HashKey_4_k(arg2), \T5
   1412        vpclmulqdq      $0x10, \T5, \T3, \T3
   1413        vpxor           \T3, \T6, \T6
   1414
   1415                vmovdqu 16*7(arg1), \T1
   1416                vaesenc \T1, \XMM1, \XMM1
   1417                vaesenc \T1, \XMM2, \XMM2
   1418                vaesenc \T1, \XMM3, \XMM3
   1419                vaesenc \T1, \XMM4, \XMM4
   1420                vaesenc \T1, \XMM5, \XMM5
   1421                vaesenc \T1, \XMM6, \XMM6
   1422                vaesenc \T1, \XMM7, \XMM7
   1423                vaesenc \T1, \XMM8, \XMM8
   1424
   1425        vmovdqa         TMP6(%rsp), \T1
   1426        vmovdqu         HashKey_3(arg2), \T5
   1427        vpclmulqdq      $0x11, \T5, \T1, \T3
   1428        vpxor           \T3, \T4, \T4
   1429        vpclmulqdq      $0x00, \T5, \T1, \T3
   1430        vpxor           \T3, \T7, \T7
   1431
   1432        vpshufd         $0b01001110, \T1, \T3
   1433        vpxor           \T1, \T3, \T3
   1434        vmovdqu         HashKey_3_k(arg2), \T5
   1435        vpclmulqdq      $0x10, \T5, \T3, \T3
   1436        vpxor           \T3, \T6, \T6
   1437
   1438
   1439                vmovdqu 16*8(arg1), \T1
   1440                vaesenc \T1, \XMM1, \XMM1
   1441                vaesenc \T1, \XMM2, \XMM2
   1442                vaesenc \T1, \XMM3, \XMM3
   1443                vaesenc \T1, \XMM4, \XMM4
   1444                vaesenc \T1, \XMM5, \XMM5
   1445                vaesenc \T1, \XMM6, \XMM6
   1446                vaesenc \T1, \XMM7, \XMM7
   1447                vaesenc \T1, \XMM8, \XMM8
   1448
   1449        vmovdqa         TMP7(%rsp), \T1
   1450        vmovdqu         HashKey_2(arg2), \T5
   1451        vpclmulqdq      $0x11, \T5, \T1, \T3
   1452        vpxor           \T3, \T4, \T4
   1453        vpclmulqdq      $0x00, \T5, \T1, \T3
   1454        vpxor           \T3, \T7, \T7
   1455
   1456        vpshufd         $0b01001110, \T1, \T3
   1457        vpxor           \T1, \T3, \T3
   1458        vmovdqu         HashKey_2_k(arg2), \T5
   1459        vpclmulqdq      $0x10, \T5, \T3, \T3
   1460        vpxor           \T3, \T6, \T6
   1461
   1462        #######################################################################
   1463
   1464                vmovdqu 16*9(arg1), \T5
   1465                vaesenc \T5, \XMM1, \XMM1
   1466                vaesenc \T5, \XMM2, \XMM2
   1467                vaesenc \T5, \XMM3, \XMM3
   1468                vaesenc \T5, \XMM4, \XMM4
   1469                vaesenc \T5, \XMM5, \XMM5
   1470                vaesenc \T5, \XMM6, \XMM6
   1471                vaesenc \T5, \XMM7, \XMM7
   1472                vaesenc \T5, \XMM8, \XMM8
   1473
   1474        vmovdqa         TMP8(%rsp), \T1
   1475        vmovdqu         HashKey(arg2), \T5
   1476        vpclmulqdq      $0x11, \T5, \T1, \T3
   1477        vpxor           \T3, \T4, \T4
   1478        vpclmulqdq      $0x00, \T5, \T1, \T3
   1479        vpxor           \T3, \T7, \T7
   1480
   1481        vpshufd         $0b01001110, \T1, \T3
   1482        vpxor           \T1, \T3, \T3
   1483        vmovdqu         HashKey_k(arg2), \T5
   1484        vpclmulqdq      $0x10, \T5, \T3, \T3
   1485        vpxor           \T3, \T6, \T6
   1486
   1487        vpxor           \T4, \T6, \T6
   1488        vpxor           \T7, \T6, \T6
   1489
   1490                vmovdqu 16*10(arg1), \T5
   1491
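        # REP is one less than the AES round count (the callers below pass 9,
        # 11 or 13 for 128-, 192- and 256-bit keys), so this loop runs the
        # remaining middle rounds and leaves the final round key in \T5; the
        # loop below XORs it with the input data and finishes with vaesenclast.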
   1492        i = 11
   1493        setreg
   1494.rep (\REP-9)
   1495
   1496        vaesenc \T5, \XMM1, \XMM1
   1497        vaesenc \T5, \XMM2, \XMM2
   1498        vaesenc \T5, \XMM3, \XMM3
   1499        vaesenc \T5, \XMM4, \XMM4
   1500        vaesenc \T5, \XMM5, \XMM5
   1501        vaesenc \T5, \XMM6, \XMM6
   1502        vaesenc \T5, \XMM7, \XMM7
   1503        vaesenc \T5, \XMM8, \XMM8
   1504
   1505        vmovdqu 16*i(arg1), \T5
   1506        i = i + 1
   1507        setreg
   1508.endr
   1509
   1510	i = 0
   1511	j = 1
   1512	setreg
   1513.rep 8
   1514		vpxor	16*i(arg4, %r11), \T5, \T2
   1515                .if \ENC_DEC == ENC
   1516                vaesenclast     \T2, reg_j, reg_j
   1517                .else
   1518                vaesenclast     \T2, reg_j, \T3
   1519                vmovdqu 16*i(arg4, %r11), reg_j
   1520                vmovdqu \T3, 16*i(arg3, %r11)
   1521                .endif
   1522	i = (i+1)
   1523	j = (j+1)
   1524	setreg
   1525.endr
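	# For encryption the freshly produced ciphertext blocks remain in
	# XMM1-XMM8 and are written out after the first reduction phase below;
	# for decryption the plaintext was already stored above and XMM1-XMM8
	# are reloaded with the ciphertext so it can be GHASHed next iteration.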
   1526	#######################################################################
   1527
   1528
   1529	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
    1530	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
   1531	vpxor	\T3, \T7, \T7
   1532	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
   1533
   1534
   1535
   1536	#######################################################################
   1537	#first phase of the reduction
   1538	#######################################################################
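	# the shift counts follow from the GHASH polynomial
	# x^128 + x^127 + x^126 + x^121 + 1 (the "mod (128,127,126,121,0)" noted
	# in the GHASH_MUL headers): in the bit-reflected domain the lower terms
	# reduce via shifts of 1, 2 and 7 bits, implemented here as dword shifts
	# of 31/30/25 (first phase) and 1/2/7 (second phase).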
    1539        vpslld  $31, \T7, \T2                           # packed left shifting << 31
    1540        vpslld  $30, \T7, \T3                           # packed left shifting << 30
    1541        vpslld  $25, \T7, \T4                           # packed left shifting << 25
   1542
   1543        vpxor   \T3, \T2, \T2                           # xor the shifted versions
   1544        vpxor   \T4, \T2, \T2
   1545
   1546        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
   1547
   1548        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
   1549        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
   1550	#######################################################################
   1551                .if \ENC_DEC == ENC
   1552		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
   1553		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
   1554		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
   1555		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
   1556		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
   1557		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
   1558		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
   1559		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
   1560                .endif
   1561
   1562	#######################################################################
   1563	#second phase of the reduction
    1564        vpsrld  $1, \T7, \T2                            # packed right shifting >> 1
    1565        vpsrld  $2, \T7, \T3                            # packed right shifting >> 2
    1566        vpsrld  $7, \T7, \T4                            # packed right shifting >> 7
   1567        vpxor   \T3, \T2, \T2                           # xor the shifted versions
   1568        vpxor   \T4, \T2, \T2
   1569
   1570        vpxor   \T1, \T2, \T2
   1571        vpxor   \T2, \T7, \T7
   1572        vpxor   \T7, \T6, \T6                           # the result is in T6
   1573	#######################################################################
   1574
   1575		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
   1576		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
   1577		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
   1578		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
   1579		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
   1580		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
   1581		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
   1582		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
   1583
   1584
   1585	vpxor	\T6, \XMM1, \XMM1
   1586
   1587
   1588
   1589.endm
   1590
   1591
    1592# GHASH the last 8 ciphertext blocks.
   1593.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
   1594
   1595        ## Karatsuba Method
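        ## Each 128x128-bit carry-less multiply is done with three vpclmulqdq
        ## instructions instead of four: a1*b1, a0*b0 and (a1^a0)*(b1^b0),
        ## where the precomputed HashKey_i_k values supply b1^b0 for each key
        ## power.  XORing a1*b1 and a0*b0 back out of the middle product (the
        ## final XORs below) recovers a1*b0 ^ a0*b1.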
   1596
   1597
   1598        vpshufd         $0b01001110, \XMM1, \T2
   1599        vpxor           \XMM1, \T2, \T2
   1600        vmovdqu         HashKey_8(arg2), \T5
   1601        vpclmulqdq      $0x11, \T5, \XMM1, \T6
   1602        vpclmulqdq      $0x00, \T5, \XMM1, \T7
   1603
   1604        vmovdqu         HashKey_8_k(arg2), \T3
   1605        vpclmulqdq      $0x00, \T3, \T2, \XMM1
   1606
   1607        ######################
   1608
   1609        vpshufd         $0b01001110, \XMM2, \T2
   1610        vpxor           \XMM2, \T2, \T2
   1611        vmovdqu         HashKey_7(arg2), \T5
   1612        vpclmulqdq      $0x11, \T5, \XMM2, \T4
   1613        vpxor           \T4, \T6, \T6
   1614
   1615        vpclmulqdq      $0x00, \T5, \XMM2, \T4
   1616        vpxor           \T4, \T7, \T7
   1617
   1618        vmovdqu         HashKey_7_k(arg2), \T3
   1619        vpclmulqdq      $0x00, \T3, \T2, \T2
   1620        vpxor           \T2, \XMM1, \XMM1
   1621
   1622        ######################
   1623
   1624        vpshufd         $0b01001110, \XMM3, \T2
   1625        vpxor           \XMM3, \T2, \T2
   1626        vmovdqu         HashKey_6(arg2), \T5
   1627        vpclmulqdq      $0x11, \T5, \XMM3, \T4
   1628        vpxor           \T4, \T6, \T6
   1629
   1630        vpclmulqdq      $0x00, \T5, \XMM3, \T4
   1631        vpxor           \T4, \T7, \T7
   1632
   1633        vmovdqu         HashKey_6_k(arg2), \T3
   1634        vpclmulqdq      $0x00, \T3, \T2, \T2
   1635        vpxor           \T2, \XMM1, \XMM1
   1636
   1637        ######################
   1638
   1639        vpshufd         $0b01001110, \XMM4, \T2
   1640        vpxor           \XMM4, \T2, \T2
   1641        vmovdqu         HashKey_5(arg2), \T5
   1642        vpclmulqdq      $0x11, \T5, \XMM4, \T4
   1643        vpxor           \T4, \T6, \T6
   1644
   1645        vpclmulqdq      $0x00, \T5, \XMM4, \T4
   1646        vpxor           \T4, \T7, \T7
   1647
   1648        vmovdqu         HashKey_5_k(arg2), \T3
   1649        vpclmulqdq      $0x00, \T3, \T2, \T2
   1650        vpxor           \T2, \XMM1, \XMM1
   1651
   1652        ######################
   1653
   1654        vpshufd         $0b01001110, \XMM5, \T2
   1655        vpxor           \XMM5, \T2, \T2
   1656        vmovdqu         HashKey_4(arg2), \T5
   1657        vpclmulqdq      $0x11, \T5, \XMM5, \T4
   1658        vpxor           \T4, \T6, \T6
   1659
   1660        vpclmulqdq      $0x00, \T5, \XMM5, \T4
   1661        vpxor           \T4, \T7, \T7
   1662
   1663        vmovdqu         HashKey_4_k(arg2), \T3
   1664        vpclmulqdq      $0x00, \T3, \T2, \T2
   1665        vpxor           \T2, \XMM1, \XMM1
   1666
   1667        ######################
   1668
   1669        vpshufd         $0b01001110, \XMM6, \T2
   1670        vpxor           \XMM6, \T2, \T2
   1671        vmovdqu         HashKey_3(arg2), \T5
   1672        vpclmulqdq      $0x11, \T5, \XMM6, \T4
   1673        vpxor           \T4, \T6, \T6
   1674
   1675        vpclmulqdq      $0x00, \T5, \XMM6, \T4
   1676        vpxor           \T4, \T7, \T7
   1677
   1678        vmovdqu         HashKey_3_k(arg2), \T3
   1679        vpclmulqdq      $0x00, \T3, \T2, \T2
   1680        vpxor           \T2, \XMM1, \XMM1
   1681
   1682        ######################
   1683
   1684        vpshufd         $0b01001110, \XMM7, \T2
   1685        vpxor           \XMM7, \T2, \T2
   1686        vmovdqu         HashKey_2(arg2), \T5
   1687        vpclmulqdq      $0x11, \T5, \XMM7, \T4
   1688        vpxor           \T4, \T6, \T6
   1689
   1690        vpclmulqdq      $0x00, \T5, \XMM7, \T4
   1691        vpxor           \T4, \T7, \T7
   1692
   1693        vmovdqu         HashKey_2_k(arg2), \T3
   1694        vpclmulqdq      $0x00, \T3, \T2, \T2
   1695        vpxor           \T2, \XMM1, \XMM1
   1696
   1697        ######################
   1698
   1699        vpshufd         $0b01001110, \XMM8, \T2
   1700        vpxor           \XMM8, \T2, \T2
   1701        vmovdqu         HashKey(arg2), \T5
   1702        vpclmulqdq      $0x11, \T5, \XMM8, \T4
   1703        vpxor           \T4, \T6, \T6
   1704
   1705        vpclmulqdq      $0x00, \T5, \XMM8, \T4
   1706        vpxor           \T4, \T7, \T7
   1707
   1708        vmovdqu         HashKey_k(arg2), \T3
   1709        vpclmulqdq      $0x00, \T3, \T2, \T2
   1710
   1711        vpxor           \T2, \XMM1, \XMM1
   1712        vpxor           \T6, \XMM1, \XMM1
   1713        vpxor           \T7, \XMM1, \T2
   1714
   1715
   1716
   1717
   1718        vpslldq $8, \T2, \T4
   1719        vpsrldq $8, \T2, \T2
   1720
   1721        vpxor   \T4, \T7, \T7
   1722        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
   1723				# the accumulated carry-less multiplications
   1724
   1725        #######################################################################
   1726        #first phase of the reduction
    1727        vpslld  $31, \T7, \T2   # packed left shifting << 31
    1728        vpslld  $30, \T7, \T3   # packed left shifting << 30
    1729        vpslld  $25, \T7, \T4   # packed left shifting << 25
   1730
   1731        vpxor   \T3, \T2, \T2   # xor the shifted versions
   1732        vpxor   \T4, \T2, \T2
   1733
   1734        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
   1735
   1736        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
   1737        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
   1738        #######################################################################
   1739
   1740
   1741        #second phase of the reduction
    1742        vpsrld  $1, \T7, \T2    # packed right shifting >> 1
    1743        vpsrld  $2, \T7, \T3    # packed right shifting >> 2
    1744        vpsrld  $7, \T7, \T4    # packed right shifting >> 7
   1745        vpxor   \T3, \T2, \T2   # xor the shifted versions
   1746        vpxor   \T4, \T2, \T2
   1747
   1748        vpxor   \T1, \T2, \T2
   1749        vpxor   \T2, \T7, \T7
   1750        vpxor   \T7, \T6, \T6   # the result is in T6
   1751
   1752.endm
   1753
   1754#############################################################
    1755#void   aesni_gcm_init_avx_gen2
   1756#        (gcm_data     *my_ctx_data,
   1757#         gcm_context_data *data,
    1758#        u8      *iv, /* Pre-counter block j0: 4 byte salt
    1759#			(from Security Association) concatenated with 8 byte
    1760#			Initialisation Vector (from IPSec ESP Payload)
    1761#			concatenated with 0x00000001. 16-byte aligned pointer. */
    1762#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
   1763#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
   1764#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
   1765#############################################################
   1766SYM_FUNC_START(aesni_gcm_init_avx_gen2)
   1767        FUNC_SAVE
   1768        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
   1769        FUNC_RESTORE
   1770        RET
   1771SYM_FUNC_END(aesni_gcm_init_avx_gen2)
   1772
   1773###############################################################################
   1774#void   aesni_gcm_enc_update_avx_gen2(
   1775#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
   1776#        gcm_context_data *data,
   1777#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
   1778#        const   u8 *in, /* Plaintext input */
   1779#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
   1780###############################################################################
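        # The three GCM_ENC_DEC expansions below differ only in the last
        # argument: 9, 11 or 13 AES rounds before the final round, chosen by
        # the keysize test for 128-, 192- and 256-bit keys respectively.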
   1781SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
   1782        FUNC_SAVE
   1783        mov     keysize, %eax
   1784        cmp     $32, %eax
   1785        je      key_256_enc_update
   1786        cmp     $16, %eax
   1787        je      key_128_enc_update
   1788        # must be 192
   1789        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
   1790        FUNC_RESTORE
   1791        RET
   1792key_128_enc_update:
   1793        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
   1794        FUNC_RESTORE
   1795        RET
   1796key_256_enc_update:
   1797        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
   1798        FUNC_RESTORE
   1799        RET
   1800SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
   1801
   1802###############################################################################
   1803#void   aesni_gcm_dec_update_avx_gen2(
   1804#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
   1805#        gcm_context_data *data,
   1806#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
   1807#        const   u8 *in, /* Ciphertext input */
    1808#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
   1809###############################################################################
   1810SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
   1811        FUNC_SAVE
   1812        mov     keysize,%eax
   1813        cmp     $32, %eax
   1814        je      key_256_dec_update
   1815        cmp     $16, %eax
   1816        je      key_128_dec_update
   1817        # must be 192
   1818        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
   1819        FUNC_RESTORE
   1820        RET
   1821key_128_dec_update:
   1822        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
   1823        FUNC_RESTORE
   1824        RET
   1825key_256_dec_update:
   1826        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
   1827        FUNC_RESTORE
   1828        RET
   1829SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
   1830
   1831###############################################################################
   1832#void   aesni_gcm_finalize_avx_gen2(
   1833#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
   1834#        gcm_context_data *data,
   1835#        u8      *auth_tag, /* Authenticated Tag output. */
   1836#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
   1837#				Valid values are 16 (most likely), 12 or 8. */
   1838###############################################################################
   1839SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
   1840        FUNC_SAVE
   1841        mov	keysize,%eax
   1842        cmp     $32, %eax
   1843        je      key_256_finalize
   1844        cmp     $16, %eax
   1845        je      key_128_finalize
   1846        # must be 192
   1847        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
   1848        FUNC_RESTORE
   1849        RET
   1850key_128_finalize:
   1851        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
   1852        FUNC_RESTORE
   1853        RET
   1854key_256_finalize:
   1855        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
   1856        FUNC_RESTORE
   1857        RET
   1858SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
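###############################################################################
# Illustrative call sequence (sketch only; variable names below are not part
# of this file): the C glue code drives the gen2 entry points as
#
#   aesni_gcm_init_avx_gen2(ctx, &data, iv, hash_subkey, aad, aad_len);
#   aesni_gcm_enc_update_avx_gen2(ctx, &data, out, in, plaintext_len);
#   aesni_gcm_finalize_avx_gen2(ctx, &data, auth_tag, auth_tag_len);
#
# enc_update/dec_update may be invoked repeatedly on successive chunks of a
# message before finalize produces the authentication tag.
###############################################################################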
   1859
   1860###############################################################################
   1861# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
   1862# Input: A and B (128-bits each, bit-reflected)
   1863# Output: C = A*B*x mod poly, (i.e. >>1 )
   1864# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
   1865# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
   1866###############################################################################
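# Compared with the gen2 GHASH_MUL_AVX, both reduction phases below are done
# with vpclmulqdq against the POLY2 constant (the reduction polynomial in the
# layout those multiplies expect) instead of shift/XOR sequences.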
   1867.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
   1868
   1869        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
   1870        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
   1871        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
   1872        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
   1873        vpxor           \T3, \GH, \GH
   1874
   1875
   1876        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
   1877        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
   1878
   1879        vpxor           \T3, \T1, \T1
   1880        vpxor           \T2, \GH, \GH
   1881
   1882        #######################################################################
   1883        #first phase of the reduction
   1884        vmovdqa         POLY2(%rip), \T3
   1885
   1886        vpclmulqdq      $0x01, \GH, \T3, \T2
   1887        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
   1888
   1889        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
   1890        #######################################################################
   1891        #second phase of the reduction
   1892        vpclmulqdq      $0x00, \GH, \T3, \T2
   1893        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
   1894
   1895        vpclmulqdq      $0x10, \GH, \T3, \GH
   1896        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
   1897
   1898        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
   1899        #######################################################################
   1900        vpxor           \T1, \GH, \GH          # the result is in GH
   1901
   1902
   1903.endm
   1904
   1905.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
   1906
    1907        # Precompute HashKey^2..HashKey^8 (each <<1 mod poly); the gen4 path needs no HashKey_i_k values.
   1908        vmovdqa  \HK, \T5
   1909        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
   1910        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
   1911
   1912        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
   1913        vmovdqu  \T5, HashKey_3(arg2)
   1914
   1915        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
   1916        vmovdqu  \T5, HashKey_4(arg2)
   1917
   1918        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
   1919        vmovdqu  \T5, HashKey_5(arg2)
   1920
   1921        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
   1922        vmovdqu  \T5, HashKey_6(arg2)
   1923
   1924        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
   1925        vmovdqu  \T5, HashKey_7(arg2)
   1926
   1927        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
   1928        vmovdqu  \T5, HashKey_8(arg2)
   1929
   1930.endm
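# Together with HashKey itself, the stored powers HashKey^2..HashKey^8 let the
# 8-block parallel loop fold eight ciphertext blocks per pass:
#   GHASH_new = (GHASH_old ^ C1)*H^8 ^ C2*H^7 ^ ... ^ C8*H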
   1931
   1932## if a = number of total plaintext bytes
   1933## b = floor(a/16)
    1934## num_initial_blocks = b mod 8
   1935## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
   1936## r10, r11, r12, rax are clobbered
   1937## arg1, arg2, arg3, arg4 are used as pointers only, not modified
   1938
   1939.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
   1940	i = (8-\num_initial_blocks)
   1941	setreg
   1942	vmovdqu AadHash(arg2), reg_i
   1943
   1944	# start AES for num_initial_blocks blocks
   1945	vmovdqu CurCount(arg2), \CTR
   1946
   1947	i = (9-\num_initial_blocks)
   1948	setreg
   1949.rep \num_initial_blocks
   1950                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
   1951                vmovdqa \CTR, reg_i
   1952                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
   1953	i = (i+1)
   1954	setreg
   1955.endr
   1956
   1957	vmovdqa  (arg1), \T_key
   1958	i = (9-\num_initial_blocks)
   1959	setreg
   1960.rep \num_initial_blocks
   1961                vpxor   \T_key, reg_i, reg_i
   1962	i = (i+1)
   1963	setreg
   1964.endr
   1965
   1966	j = 1
   1967	setreg
   1968.rep \REP
   1969	vmovdqa  16*j(arg1), \T_key
   1970	i = (9-\num_initial_blocks)
   1971	setreg
   1972.rep \num_initial_blocks
   1973        vaesenc \T_key, reg_i, reg_i
   1974	i = (i+1)
   1975	setreg
   1976.endr
   1977
   1978	j = (j+1)
   1979	setreg
   1980.endr
   1981
   1982
   1983	vmovdqa  16*j(arg1), \T_key
   1984	i = (9-\num_initial_blocks)
   1985	setreg
   1986.rep \num_initial_blocks
   1987        vaesenclast      \T_key, reg_i, reg_i
   1988	i = (i+1)
   1989	setreg
   1990.endr
   1991
   1992	i = (9-\num_initial_blocks)
   1993	setreg
   1994.rep \num_initial_blocks
   1995                vmovdqu (arg4, %r11), \T1
   1996                vpxor   \T1, reg_i, reg_i
   1997                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
   1998						       # num_initial_blocks blocks
   1999                add     $16, %r11
   2000.if  \ENC_DEC == DEC
   2001                vmovdqa \T1, reg_i
   2002.endif
   2003                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
   2004	i = (i+1)
   2005	setreg
   2006.endr
   2007
   2008
   2009	i = (8-\num_initial_blocks)
   2010	j = (9-\num_initial_blocks)
   2011	setreg
   2012
   2013.rep \num_initial_blocks
   2014        vpxor    reg_i, reg_j, reg_j
   2015        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
   2016	i = (i+1)
   2017	j = (j+1)
   2018	setreg
   2019.endr
   2020        # XMM8 has the combined result here
   2021
   2022        vmovdqa  \XMM8, TMP1(%rsp)
   2023        vmovdqa  \XMM8, \T3
   2024
   2025        cmp     $128, %r13
   2026        jl      _initial_blocks_done\@                  # no need for precomputed constants
   2027
   2028###############################################################################
    2029# At least 128 bytes remain: prepare and encrypt the first eight counter blocks for the main parallel loop
   2030                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   2031                vmovdqa  \CTR, \XMM1
   2032                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
   2033
   2034                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   2035                vmovdqa  \CTR, \XMM2
   2036                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
   2037
   2038                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   2039                vmovdqa  \CTR, \XMM3
   2040                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
   2041
   2042                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   2043                vmovdqa  \CTR, \XMM4
   2044                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
   2045
   2046                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   2047                vmovdqa  \CTR, \XMM5
   2048                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
   2049
   2050                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   2051                vmovdqa  \CTR, \XMM6
   2052                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
   2053
   2054                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   2055                vmovdqa  \CTR, \XMM7
   2056                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
   2057
   2058                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
   2059                vmovdqa  \CTR, \XMM8
   2060                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
   2061
   2062                vmovdqa  (arg1), \T_key
   2063                vpxor    \T_key, \XMM1, \XMM1
   2064                vpxor    \T_key, \XMM2, \XMM2
   2065                vpxor    \T_key, \XMM3, \XMM3
   2066                vpxor    \T_key, \XMM4, \XMM4
   2067                vpxor    \T_key, \XMM5, \XMM5
   2068                vpxor    \T_key, \XMM6, \XMM6
   2069                vpxor    \T_key, \XMM7, \XMM7
   2070                vpxor    \T_key, \XMM8, \XMM8
   2071
   2072		i = 1
   2073		setreg
   2074.rep    \REP       # do REP rounds
   2075                vmovdqa  16*i(arg1), \T_key
   2076                vaesenc  \T_key, \XMM1, \XMM1
   2077                vaesenc  \T_key, \XMM2, \XMM2
   2078                vaesenc  \T_key, \XMM3, \XMM3
   2079                vaesenc  \T_key, \XMM4, \XMM4
   2080                vaesenc  \T_key, \XMM5, \XMM5
   2081                vaesenc  \T_key, \XMM6, \XMM6
   2082                vaesenc  \T_key, \XMM7, \XMM7
   2083                vaesenc  \T_key, \XMM8, \XMM8
   2084		i = (i+1)
   2085		setreg
   2086.endr
   2087
   2088
   2089                vmovdqa  16*i(arg1), \T_key
   2090                vaesenclast  \T_key, \XMM1, \XMM1
   2091                vaesenclast  \T_key, \XMM2, \XMM2
   2092                vaesenclast  \T_key, \XMM3, \XMM3
   2093                vaesenclast  \T_key, \XMM4, \XMM4
   2094                vaesenclast  \T_key, \XMM5, \XMM5
   2095                vaesenclast  \T_key, \XMM6, \XMM6
   2096                vaesenclast  \T_key, \XMM7, \XMM7
   2097                vaesenclast  \T_key, \XMM8, \XMM8
   2098
   2099                vmovdqu  (arg4, %r11), \T1
   2100                vpxor    \T1, \XMM1, \XMM1
   2101                vmovdqu  \XMM1, (arg3 , %r11)
   2102                .if   \ENC_DEC == DEC
   2103                vmovdqa  \T1, \XMM1
   2104                .endif
   2105
   2106                vmovdqu  16*1(arg4, %r11), \T1
   2107                vpxor    \T1, \XMM2, \XMM2
   2108                vmovdqu  \XMM2, 16*1(arg3 , %r11)
   2109                .if   \ENC_DEC == DEC
   2110                vmovdqa  \T1, \XMM2
   2111                .endif
   2112
   2113                vmovdqu  16*2(arg4, %r11), \T1
   2114                vpxor    \T1, \XMM3, \XMM3
   2115                vmovdqu  \XMM3, 16*2(arg3 , %r11)
   2116                .if   \ENC_DEC == DEC
   2117                vmovdqa  \T1, \XMM3
   2118                .endif
   2119
   2120                vmovdqu  16*3(arg4, %r11), \T1
   2121                vpxor    \T1, \XMM4, \XMM4
   2122                vmovdqu  \XMM4, 16*3(arg3 , %r11)
   2123                .if   \ENC_DEC == DEC
   2124                vmovdqa  \T1, \XMM4
   2125                .endif
   2126
   2127                vmovdqu  16*4(arg4, %r11), \T1
   2128                vpxor    \T1, \XMM5, \XMM5
   2129                vmovdqu  \XMM5, 16*4(arg3 , %r11)
   2130                .if   \ENC_DEC == DEC
   2131                vmovdqa  \T1, \XMM5
   2132                .endif
   2133
   2134                vmovdqu  16*5(arg4, %r11), \T1
   2135                vpxor    \T1, \XMM6, \XMM6
   2136                vmovdqu  \XMM6, 16*5(arg3 , %r11)
   2137                .if   \ENC_DEC == DEC
   2138                vmovdqa  \T1, \XMM6
   2139                .endif
   2140
   2141                vmovdqu  16*6(arg4, %r11), \T1
   2142                vpxor    \T1, \XMM7, \XMM7
   2143                vmovdqu  \XMM7, 16*6(arg3 , %r11)
   2144                .if   \ENC_DEC == DEC
   2145                vmovdqa  \T1, \XMM7
   2146                .endif
   2147
   2148                vmovdqu  16*7(arg4, %r11), \T1
   2149                vpxor    \T1, \XMM8, \XMM8
   2150                vmovdqu  \XMM8, 16*7(arg3 , %r11)
   2151                .if   \ENC_DEC == DEC
   2152                vmovdqa  \T1, \XMM8
   2153                .endif
   2154
   2155                add     $128, %r11
   2156
   2157                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
   2158                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
   2159							   # the corresponding ciphertext
   2160                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
   2161                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
   2162                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
   2163                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
   2164                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
   2165                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
   2166                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
   2167
   2168###############################################################################
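# If the 128-byte path above was taken, XMM1-XMM8 now hold the byte-swapped
# ciphertext of the first full 8-block chunk with the running GHASH folded
# into XMM1, and %r11 points past all data processed so far, ready for
# GHASH_8_ENCRYPT_8_PARALLEL_AVX2.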
   2169
   2170_initial_blocks_done\@:
   2171
   2172
   2173.endm
   2174
   2175
   2176
   2177# encrypt 8 blocks at a time
   2178# ghash the 8 previously encrypted ciphertext blocks
   2179# arg1, arg2, arg3, arg4 are used as pointers only, not modified
   2180# r11 is the data offset value
   2181.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
   2182
   2183        vmovdqa \XMM1, \T2
   2184        vmovdqa \XMM2, TMP2(%rsp)
   2185        vmovdqa \XMM3, TMP3(%rsp)
   2186        vmovdqa \XMM4, TMP4(%rsp)
   2187        vmovdqa \XMM5, TMP5(%rsp)
   2188        vmovdqa \XMM6, TMP6(%rsp)
   2189        vmovdqa \XMM7, TMP7(%rsp)
   2190        vmovdqa \XMM8, TMP8(%rsp)
   2191
   2192.if \loop_idx == in_order
   2193                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
   2194                vpaddd  ONE(%rip), \XMM1, \XMM2
   2195                vpaddd  ONE(%rip), \XMM2, \XMM3
   2196                vpaddd  ONE(%rip), \XMM3, \XMM4
   2197                vpaddd  ONE(%rip), \XMM4, \XMM5
   2198                vpaddd  ONE(%rip), \XMM5, \XMM6
   2199                vpaddd  ONE(%rip), \XMM6, \XMM7
   2200                vpaddd  ONE(%rip), \XMM7, \XMM8
   2201                vmovdqa \XMM8, \CTR
   2202
   2203                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
   2204                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
   2205                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
   2206                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
   2207                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
   2208                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
   2209                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
   2210                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
   2211.else
   2212                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
   2213                vpaddd  ONEf(%rip), \XMM1, \XMM2
   2214                vpaddd  ONEf(%rip), \XMM2, \XMM3
   2215                vpaddd  ONEf(%rip), \XMM3, \XMM4
   2216                vpaddd  ONEf(%rip), \XMM4, \XMM5
   2217                vpaddd  ONEf(%rip), \XMM5, \XMM6
   2218                vpaddd  ONEf(%rip), \XMM6, \XMM7
   2219                vpaddd  ONEf(%rip), \XMM7, \XMM8
   2220                vmovdqa \XMM8, \CTR
   2221.endif
   2222
   2223
   2224        #######################################################################
   2225
   2226                vmovdqu (arg1), \T1
   2227                vpxor   \T1, \XMM1, \XMM1
   2228                vpxor   \T1, \XMM2, \XMM2
   2229                vpxor   \T1, \XMM3, \XMM3
   2230                vpxor   \T1, \XMM4, \XMM4
   2231                vpxor   \T1, \XMM5, \XMM5
   2232                vpxor   \T1, \XMM6, \XMM6
   2233                vpxor   \T1, \XMM7, \XMM7
   2234                vpxor   \T1, \XMM8, \XMM8
   2235
   2236        #######################################################################
   2237
   2238
   2239
   2240
   2241
   2242                vmovdqu 16*1(arg1), \T1
   2243                vaesenc \T1, \XMM1, \XMM1
   2244                vaesenc \T1, \XMM2, \XMM2
   2245                vaesenc \T1, \XMM3, \XMM3
   2246                vaesenc \T1, \XMM4, \XMM4
   2247                vaesenc \T1, \XMM5, \XMM5
   2248                vaesenc \T1, \XMM6, \XMM6
   2249                vaesenc \T1, \XMM7, \XMM7
   2250                vaesenc \T1, \XMM8, \XMM8
   2251
   2252                vmovdqu 16*2(arg1), \T1
   2253                vaesenc \T1, \XMM1, \XMM1
   2254                vaesenc \T1, \XMM2, \XMM2
   2255                vaesenc \T1, \XMM3, \XMM3
   2256                vaesenc \T1, \XMM4, \XMM4
   2257                vaesenc \T1, \XMM5, \XMM5
   2258                vaesenc \T1, \XMM6, \XMM6
   2259                vaesenc \T1, \XMM7, \XMM7
   2260                vaesenc \T1, \XMM8, \XMM8
   2261
   2262
   2263        #######################################################################
   2264
   2265        vmovdqu         HashKey_8(arg2), \T5
   2266        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
   2267        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
   2268        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
   2269        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
   2270        vpxor           \T5, \T6, \T6
   2271
   2272                vmovdqu 16*3(arg1), \T1
   2273                vaesenc \T1, \XMM1, \XMM1
   2274                vaesenc \T1, \XMM2, \XMM2
   2275                vaesenc \T1, \XMM3, \XMM3
   2276                vaesenc \T1, \XMM4, \XMM4
   2277                vaesenc \T1, \XMM5, \XMM5
   2278                vaesenc \T1, \XMM6, \XMM6
   2279                vaesenc \T1, \XMM7, \XMM7
   2280                vaesenc \T1, \XMM8, \XMM8
   2281
   2282        vmovdqa         TMP2(%rsp), \T1
   2283        vmovdqu         HashKey_7(arg2), \T5
   2284        vpclmulqdq      $0x11, \T5, \T1, \T3
   2285        vpxor           \T3, \T4, \T4
   2286
   2287        vpclmulqdq      $0x00, \T5, \T1, \T3
   2288        vpxor           \T3, \T7, \T7
   2289
   2290        vpclmulqdq      $0x01, \T5, \T1, \T3
   2291        vpxor           \T3, \T6, \T6
   2292
   2293        vpclmulqdq      $0x10, \T5, \T1, \T3
   2294        vpxor           \T3, \T6, \T6
   2295
   2296                vmovdqu 16*4(arg1), \T1
   2297                vaesenc \T1, \XMM1, \XMM1
   2298                vaesenc \T1, \XMM2, \XMM2
   2299                vaesenc \T1, \XMM3, \XMM3
   2300                vaesenc \T1, \XMM4, \XMM4
   2301                vaesenc \T1, \XMM5, \XMM5
   2302                vaesenc \T1, \XMM6, \XMM6
   2303                vaesenc \T1, \XMM7, \XMM7
   2304                vaesenc \T1, \XMM8, \XMM8
   2305
   2306        #######################################################################
   2307
   2308        vmovdqa         TMP3(%rsp), \T1
   2309        vmovdqu         HashKey_6(arg2), \T5
   2310        vpclmulqdq      $0x11, \T5, \T1, \T3
   2311        vpxor           \T3, \T4, \T4
   2312
   2313        vpclmulqdq      $0x00, \T5, \T1, \T3
   2314        vpxor           \T3, \T7, \T7
   2315
   2316        vpclmulqdq      $0x01, \T5, \T1, \T3
   2317        vpxor           \T3, \T6, \T6
   2318
   2319        vpclmulqdq      $0x10, \T5, \T1, \T3
   2320        vpxor           \T3, \T6, \T6
   2321
   2322                vmovdqu 16*5(arg1), \T1
   2323                vaesenc \T1, \XMM1, \XMM1
   2324                vaesenc \T1, \XMM2, \XMM2
   2325                vaesenc \T1, \XMM3, \XMM3
   2326                vaesenc \T1, \XMM4, \XMM4
   2327                vaesenc \T1, \XMM5, \XMM5
   2328                vaesenc \T1, \XMM6, \XMM6
   2329                vaesenc \T1, \XMM7, \XMM7
   2330                vaesenc \T1, \XMM8, \XMM8
   2331
   2332        vmovdqa         TMP4(%rsp), \T1
   2333        vmovdqu         HashKey_5(arg2), \T5
   2334        vpclmulqdq      $0x11, \T5, \T1, \T3
   2335        vpxor           \T3, \T4, \T4
   2336
   2337        vpclmulqdq      $0x00, \T5, \T1, \T3
   2338        vpxor           \T3, \T7, \T7
   2339
   2340        vpclmulqdq      $0x01, \T5, \T1, \T3
   2341        vpxor           \T3, \T6, \T6
   2342
   2343        vpclmulqdq      $0x10, \T5, \T1, \T3
   2344        vpxor           \T3, \T6, \T6
   2345
   2346                vmovdqu 16*6(arg1), \T1
   2347                vaesenc \T1, \XMM1, \XMM1
   2348                vaesenc \T1, \XMM2, \XMM2
   2349                vaesenc \T1, \XMM3, \XMM3
   2350                vaesenc \T1, \XMM4, \XMM4
   2351                vaesenc \T1, \XMM5, \XMM5
   2352                vaesenc \T1, \XMM6, \XMM6
   2353                vaesenc \T1, \XMM7, \XMM7
   2354                vaesenc \T1, \XMM8, \XMM8
   2355
   2356
   2357        vmovdqa         TMP5(%rsp), \T1
   2358        vmovdqu         HashKey_4(arg2), \T5
   2359        vpclmulqdq      $0x11, \T5, \T1, \T3
   2360        vpxor           \T3, \T4, \T4
   2361
   2362        vpclmulqdq      $0x00, \T5, \T1, \T3
   2363        vpxor           \T3, \T7, \T7
   2364
   2365        vpclmulqdq      $0x01, \T5, \T1, \T3
   2366        vpxor           \T3, \T6, \T6
   2367
   2368        vpclmulqdq      $0x10, \T5, \T1, \T3
   2369        vpxor           \T3, \T6, \T6
   2370
   2371                vmovdqu 16*7(arg1), \T1
   2372                vaesenc \T1, \XMM1, \XMM1
   2373                vaesenc \T1, \XMM2, \XMM2
   2374                vaesenc \T1, \XMM3, \XMM3
   2375                vaesenc \T1, \XMM4, \XMM4
   2376                vaesenc \T1, \XMM5, \XMM5
   2377                vaesenc \T1, \XMM6, \XMM6
   2378                vaesenc \T1, \XMM7, \XMM7
   2379                vaesenc \T1, \XMM8, \XMM8
   2380
   2381        vmovdqa         TMP6(%rsp), \T1
   2382        vmovdqu         HashKey_3(arg2), \T5
   2383        vpclmulqdq      $0x11, \T5, \T1, \T3
   2384        vpxor           \T3, \T4, \T4
   2385
   2386        vpclmulqdq      $0x00, \T5, \T1, \T3
   2387        vpxor           \T3, \T7, \T7
   2388
   2389        vpclmulqdq      $0x01, \T5, \T1, \T3
   2390        vpxor           \T3, \T6, \T6
   2391
   2392        vpclmulqdq      $0x10, \T5, \T1, \T3
   2393        vpxor           \T3, \T6, \T6
   2394
   2395                vmovdqu 16*8(arg1), \T1
   2396                vaesenc \T1, \XMM1, \XMM1
   2397                vaesenc \T1, \XMM2, \XMM2
   2398                vaesenc \T1, \XMM3, \XMM3
   2399                vaesenc \T1, \XMM4, \XMM4
   2400                vaesenc \T1, \XMM5, \XMM5
   2401                vaesenc \T1, \XMM6, \XMM6
   2402                vaesenc \T1, \XMM7, \XMM7
   2403                vaesenc \T1, \XMM8, \XMM8
   2404
   2405        vmovdqa         TMP7(%rsp), \T1
   2406        vmovdqu         HashKey_2(arg2), \T5
   2407        vpclmulqdq      $0x11, \T5, \T1, \T3
   2408        vpxor           \T3, \T4, \T4
   2409
   2410        vpclmulqdq      $0x00, \T5, \T1, \T3
   2411        vpxor           \T3, \T7, \T7
   2412
   2413        vpclmulqdq      $0x01, \T5, \T1, \T3
   2414        vpxor           \T3, \T6, \T6
   2415
   2416        vpclmulqdq      $0x10, \T5, \T1, \T3
   2417        vpxor           \T3, \T6, \T6
   2418
   2419
   2420        #######################################################################
   2421
   2422                vmovdqu 16*9(arg1), \T5
   2423                vaesenc \T5, \XMM1, \XMM1
   2424                vaesenc \T5, \XMM2, \XMM2
   2425                vaesenc \T5, \XMM3, \XMM3
   2426                vaesenc \T5, \XMM4, \XMM4
   2427                vaesenc \T5, \XMM5, \XMM5
   2428                vaesenc \T5, \XMM6, \XMM6
   2429                vaesenc \T5, \XMM7, \XMM7
   2430                vaesenc \T5, \XMM8, \XMM8
   2431
   2432        vmovdqa         TMP8(%rsp), \T1
   2433        vmovdqu         HashKey(arg2), \T5
   2434
   2435        vpclmulqdq      $0x00, \T5, \T1, \T3
   2436        vpxor           \T3, \T7, \T7
   2437
   2438        vpclmulqdq      $0x01, \T5, \T1, \T3
   2439        vpxor           \T3, \T6, \T6
   2440
   2441        vpclmulqdq      $0x10, \T5, \T1, \T3
   2442        vpxor           \T3, \T6, \T6
   2443
   2444        vpclmulqdq      $0x11, \T5, \T1, \T3
   2445        vpxor           \T3, \T4, \T1
   2446
   2447
   2448                vmovdqu 16*10(arg1), \T5
   2449
   2450        i = 11
   2451        setreg
   2452.rep (\REP-9)
   2453        vaesenc \T5, \XMM1, \XMM1
   2454        vaesenc \T5, \XMM2, \XMM2
   2455        vaesenc \T5, \XMM3, \XMM3
   2456        vaesenc \T5, \XMM4, \XMM4
   2457        vaesenc \T5, \XMM5, \XMM5
   2458        vaesenc \T5, \XMM6, \XMM6
   2459        vaesenc \T5, \XMM7, \XMM7
   2460        vaesenc \T5, \XMM8, \XMM8
   2461
   2462        vmovdqu 16*i(arg1), \T5
   2463        i = i + 1
   2464        setreg
   2465.endr
   2466
   2467	i = 0
   2468	j = 1
   2469	setreg
   2470.rep 8
   2471		vpxor	16*i(arg4, %r11), \T5, \T2
   2472                .if \ENC_DEC == ENC
   2473                vaesenclast     \T2, reg_j, reg_j
   2474                .else
   2475                vaesenclast     \T2, reg_j, \T3
   2476                vmovdqu 16*i(arg4, %r11), reg_j
   2477                vmovdqu \T3, 16*i(arg3, %r11)
   2478                .endif
   2479	i = (i+1)
   2480	j = (j+1)
   2481	setreg
   2482.endr
   2483	#######################################################################
   2484
   2485
   2486	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
    2487	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
   2488	vpxor	\T3, \T7, \T7
   2489	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
   2490
   2491
   2492
   2493	#######################################################################
   2494	#first phase of the reduction
   2495	vmovdqa         POLY2(%rip), \T3
   2496
   2497	vpclmulqdq	$0x01, \T7, \T3, \T2
   2498	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
   2499
   2500	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
   2501	#######################################################################
   2502                .if \ENC_DEC == ENC
   2503		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
   2504		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
   2505		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
   2506		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
   2507		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
   2508		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
   2509		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
   2510		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
   2511                .endif
   2512
   2513	#######################################################################
   2514	#second phase of the reduction
   2515	vpclmulqdq	$0x00, \T7, \T3, \T2
   2516	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
   2517
   2518	vpclmulqdq	$0x10, \T7, \T3, \T4
    2519	vpslldq		$4, \T4, \T4			# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
   2520
   2521	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
   2522	#######################################################################
   2523	vpxor		\T4, \T1, \T1			# the result is in T1
   2524
   2525		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
   2526		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
   2527		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
   2528		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
   2529		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
   2530		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
   2531		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
   2532		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
   2533
   2534
   2535	vpxor	\T1, \XMM1, \XMM1
   2536
   2537
   2538
   2539.endm
   2540
   2541
    2542# GHASH the last 8 ciphertext blocks.
   2543.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
   2544
   2545        ## Karatsuba Method
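        ## Same Karatsuba split as GHASH_LAST_8_AVX, except the XOR of the key
        ## halves (b1^b0) is formed on the fly with vpshufd/vpxor instead of
        ## being loaded from a HashKey_i_k table.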
   2546
   2547        vmovdqu         HashKey_8(arg2), \T5
   2548
   2549        vpshufd         $0b01001110, \XMM1, \T2
   2550        vpshufd         $0b01001110, \T5, \T3
   2551        vpxor           \XMM1, \T2, \T2
   2552        vpxor           \T5, \T3, \T3
   2553
   2554        vpclmulqdq      $0x11, \T5, \XMM1, \T6
   2555        vpclmulqdq      $0x00, \T5, \XMM1, \T7
   2556
   2557        vpclmulqdq      $0x00, \T3, \T2, \XMM1
   2558
   2559        ######################
   2560
   2561        vmovdqu         HashKey_7(arg2), \T5
   2562        vpshufd         $0b01001110, \XMM2, \T2
   2563        vpshufd         $0b01001110, \T5, \T3
   2564        vpxor           \XMM2, \T2, \T2
   2565        vpxor           \T5, \T3, \T3
   2566
   2567        vpclmulqdq      $0x11, \T5, \XMM2, \T4
   2568        vpxor           \T4, \T6, \T6
   2569
   2570        vpclmulqdq      $0x00, \T5, \XMM2, \T4
   2571        vpxor           \T4, \T7, \T7
   2572
   2573        vpclmulqdq      $0x00, \T3, \T2, \T2
   2574
   2575        vpxor           \T2, \XMM1, \XMM1
   2576
   2577        ######################
   2578
   2579        vmovdqu         HashKey_6(arg2), \T5
   2580        vpshufd         $0b01001110, \XMM3, \T2
   2581        vpshufd         $0b01001110, \T5, \T3
   2582        vpxor           \XMM3, \T2, \T2
   2583        vpxor           \T5, \T3, \T3
   2584
   2585        vpclmulqdq      $0x11, \T5, \XMM3, \T4
   2586        vpxor           \T4, \T6, \T6
   2587
   2588        vpclmulqdq      $0x00, \T5, \XMM3, \T4
   2589        vpxor           \T4, \T7, \T7
   2590
   2591        vpclmulqdq      $0x00, \T3, \T2, \T2
   2592
   2593        vpxor           \T2, \XMM1, \XMM1
   2594
   2595        ######################
   2596
   2597        vmovdqu         HashKey_5(arg2), \T5
   2598        vpshufd         $0b01001110, \XMM4, \T2
   2599        vpshufd         $0b01001110, \T5, \T3
   2600        vpxor           \XMM4, \T2, \T2
   2601        vpxor           \T5, \T3, \T3
   2602
   2603        vpclmulqdq      $0x11, \T5, \XMM4, \T4
   2604        vpxor           \T4, \T6, \T6
   2605
   2606        vpclmulqdq      $0x00, \T5, \XMM4, \T4
   2607        vpxor           \T4, \T7, \T7
   2608
   2609        vpclmulqdq      $0x00, \T3, \T2, \T2
   2610
   2611        vpxor           \T2, \XMM1, \XMM1
   2612
   2613        ######################
   2614
   2615        vmovdqu         HashKey_4(arg2), \T5
   2616        vpshufd         $0b01001110, \XMM5, \T2
   2617        vpshufd         $0b01001110, \T5, \T3
   2618        vpxor           \XMM5, \T2, \T2
   2619        vpxor           \T5, \T3, \T3
   2620
   2621        vpclmulqdq      $0x11, \T5, \XMM5, \T4
   2622        vpxor           \T4, \T6, \T6
   2623
   2624        vpclmulqdq      $0x00, \T5, \XMM5, \T4
   2625        vpxor           \T4, \T7, \T7
   2626
   2627        vpclmulqdq      $0x00, \T3, \T2, \T2
   2628
   2629        vpxor           \T2, \XMM1, \XMM1
   2630
   2631        ######################
   2632
   2633        vmovdqu         HashKey_3(arg2), \T5
   2634        vpshufd         $0b01001110, \XMM6, \T2
   2635        vpshufd         $0b01001110, \T5, \T3
   2636        vpxor           \XMM6, \T2, \T2
   2637        vpxor           \T5, \T3, \T3
   2638
   2639        vpclmulqdq      $0x11, \T5, \XMM6, \T4
   2640        vpxor           \T4, \T6, \T6
   2641
   2642        vpclmulqdq      $0x00, \T5, \XMM6, \T4
   2643        vpxor           \T4, \T7, \T7
   2644
   2645        vpclmulqdq      $0x00, \T3, \T2, \T2
   2646
   2647        vpxor           \T2, \XMM1, \XMM1
   2648
   2649        ######################
   2650
   2651        vmovdqu         HashKey_2(arg2), \T5
   2652        vpshufd         $0b01001110, \XMM7, \T2
   2653        vpshufd         $0b01001110, \T5, \T3
   2654        vpxor           \XMM7, \T2, \T2
   2655        vpxor           \T5, \T3, \T3
   2656
   2657        vpclmulqdq      $0x11, \T5, \XMM7, \T4
   2658        vpxor           \T4, \T6, \T6
   2659
   2660        vpclmulqdq      $0x00, \T5, \XMM7, \T4
   2661        vpxor           \T4, \T7, \T7
   2662
   2663        vpclmulqdq      $0x00, \T3, \T2, \T2
   2664
   2665        vpxor           \T2, \XMM1, \XMM1
   2666
   2667        ######################
   2668
   2669        vmovdqu         HashKey(arg2), \T5
   2670        vpshufd         $0b01001110, \XMM8, \T2
   2671        vpshufd         $0b01001110, \T5, \T3
   2672        vpxor           \XMM8, \T2, \T2
   2673        vpxor           \T5, \T3, \T3
   2674
   2675        vpclmulqdq      $0x11, \T5, \XMM8, \T4
   2676        vpxor           \T4, \T6, \T6
   2677
   2678        vpclmulqdq      $0x00, \T5, \XMM8, \T4
   2679        vpxor           \T4, \T7, \T7
   2680
   2681        vpclmulqdq      $0x00, \T3, \T2, \T2
   2682
   2683        vpxor           \T2, \XMM1, \XMM1
   2684        vpxor           \T6, \XMM1, \XMM1
   2685        vpxor           \T7, \XMM1, \T2
   2686
   2687
   2688
   2689
   2690        vpslldq $8, \T2, \T4
   2691        vpsrldq $8, \T2, \T2
   2692
   2693        vpxor   \T4, \T7, \T7
   2694        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
   2695						   # accumulated carry-less multiplications
   2696
   2697        #######################################################################
   2698        #first phase of the reduction
   2699        vmovdqa         POLY2(%rip), \T3
   2700
   2701        vpclmulqdq      $0x01, \T7, \T3, \T2
    2702        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs
   2703
   2704        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
   2705        #######################################################################
   2706
   2707
   2708        #second phase of the reduction
   2709        vpclmulqdq      $0x00, \T7, \T3, \T2
   2710        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
   2711
   2712        vpclmulqdq      $0x10, \T7, \T3, \T4
   2713        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
   2714
   2715        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
   2716        #######################################################################
   2717        vpxor           \T4, \T6, \T6              # the result is in T6
   2718.endm
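        #######################################################################
        # Note: each HashKey_n block above is one Karatsuba carry-less
        # multiply: the 0x11 VPCLMULQDQ accumulates the high*high products in
        # T6, the 0x00 VPCLMULQDQ accumulates the low*low products in T7, and
        # the VPSHUFD/VPXOR pairs feed the (hi^lo)*(hi^lo) middle products
        # into XMM1.  The VPSLLDQ/VPSRLDQ pair then folds that middle term
        # into <T6:T7> before the two-phase POLY2 reduction.  A minimal
        # C-intrinsics sketch of one such multiply (not the kernel's code;
        # assumes PCLMUL support and <immintrin.h>, name is illustrative):
        #
        #   #include <immintrin.h>
        #
        #   /* hi:lo = 256-bit carry-less product of a and b over GF(2)[x] */
        #   static void clmul_karatsuba(__m128i a, __m128i b,
        #                               __m128i *hi, __m128i *lo)
        #   {
        #           __m128i h  = _mm_clmulepi64_si128(a, b, 0x11); /* a.hi*b.hi */
        #           __m128i l  = _mm_clmulepi64_si128(a, b, 0x00); /* a.lo*b.lo */
        #           __m128i am = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
        #           __m128i bm = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
        #           __m128i m  = _mm_clmulepi64_si128(am, bm, 0x00);
        #
        #           m   = _mm_xor_si128(m, _mm_xor_si128(h, l)); /* middle term */
        #           *lo = _mm_xor_si128(l, _mm_slli_si128(m, 8));
        #           *hi = _mm_xor_si128(h, _mm_srli_si128(m, 8));
        #   }
        #######################################################################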
   2719
   2720
   2721
   2722#############################################################
   2723#void   aesni_gcm_init_avx_gen4
   2724#        (gcm_data     *my_ctx_data,
   2725#         gcm_context_data *data,
   2726#        u8      *iv, /* Pre-counter block j0: 4 byte salt
   2727#			(from Security Association) concatenated with 8 byte
   2728#			Initialisation Vector (from IPSec ESP Payload)
   2729#			concatenated with 0x00000001. 16-byte aligned pointer. */
    2730#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
   2731#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
   2732#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
   2733#############################################################
   2734SYM_FUNC_START(aesni_gcm_init_avx_gen4)
   2735        FUNC_SAVE
   2736        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
   2737        FUNC_RESTORE
   2738        RET
   2739SYM_FUNC_END(aesni_gcm_init_avx_gen4)
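        #######################################################################
        # The iv argument documented above is already the formatted
        # pre-counter block j0.  A hypothetical helper (purely illustrative,
        # not part of the kernel glue) showing that RFC 4106 layout,
        # 4-byte salt || 8-byte explicit IV || 0x00000001:
        #
        #   #include <stdint.h>
        #   #include <string.h>
        #
        #   static void build_j0(uint8_t j0[16], const uint8_t salt[4],
        #                        const uint8_t iv[8])
        #   {
        #           memcpy(j0, salt, 4);     /* salt from the SA            */
        #           memcpy(j0 + 4, iv, 8);   /* explicit IV from the packet */
        #           j0[12] = 0;              /* 32-bit counter = 1,         */
        #           j0[13] = 0;              /* big-endian                  */
        #           j0[14] = 0;
        #           j0[15] = 1;
        #   }
        #######################################################################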
   2740
   2741###############################################################################
    2742#void   aesni_gcm_enc_update_avx_gen4(
   2743#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
   2744#        gcm_context_data *data,
   2745#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
   2746#        const   u8 *in, /* Plaintext input */
   2747#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
   2748###############################################################################
   2749SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
   2750        FUNC_SAVE
   2751        mov     keysize,%eax
   2752        cmp     $32, %eax
   2753        je      key_256_enc_update4
   2754        cmp     $16, %eax
   2755        je      key_128_enc_update4
   2756        # must be 192
   2757        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
   2758        FUNC_RESTORE
    2759        RET
   2760key_128_enc_update4:
   2761        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
   2762        FUNC_RESTORE
    2763        RET
   2764key_256_enc_update4:
   2765        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
   2766        FUNC_RESTORE
    2767        RET
   2768SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
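        #######################################################################
        # The branches above (and the identical ones in the decrypt and
        # finalize paths below) pick the round-count argument from the
        # expanded key's keysize field: 9 for AES-128 (16-byte key), 11 for
        # AES-192, 13 for AES-256.  A minimal C sketch of the same mapping
        # (function name is illustrative):
        #
        #   static int gcm_rounds_arg(unsigned int keysize)
        #   {
        #           switch (keysize) {
        #           case 16: return 9;       /* AES-128     */
        #           case 32: return 13;      /* AES-256     */
        #           default: return 11;      /* must be 192 */
        #           }
        #   }
        #######################################################################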
   2769
   2770###############################################################################
   2771#void   aesni_gcm_dec_update_avx_gen4(
   2772#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
   2773#        gcm_context_data *data,
   2774#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
   2775#        const   u8 *in, /* Ciphertext input */
    2776#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
   2777###############################################################################
   2778SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
   2779        FUNC_SAVE
   2780        mov     keysize,%eax
   2781        cmp     $32, %eax
   2782        je      key_256_dec_update4
   2783        cmp     $16, %eax
   2784        je      key_128_dec_update4
   2785        # must be 192
   2786        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
   2787        FUNC_RESTORE
   2788        RET
   2789key_128_dec_update4:
   2790        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
   2791        FUNC_RESTORE
   2792        RET
   2793key_256_dec_update4:
   2794        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
   2795        FUNC_RESTORE
   2796        RET
   2797SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
   2798
   2799###############################################################################
   2800#void   aesni_gcm_finalize_avx_gen4(
   2801#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
   2802#        gcm_context_data *data,
   2803#        u8      *auth_tag, /* Authenticated Tag output. */
    2804#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
   2805#                              Valid values are 16 (most likely), 12 or 8. */
   2806###############################################################################
   2807SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
   2808        FUNC_SAVE
    2809        mov     keysize,%eax
   2810        cmp     $32, %eax
   2811        je      key_256_finalize4
   2812        cmp     $16, %eax
   2813        je      key_128_finalize4
   2814        # must be 192
   2815        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
   2816        FUNC_RESTORE
   2817        RET
   2818key_128_finalize4:
   2819        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
   2820        FUNC_RESTORE
   2821        RET
   2822key_256_finalize4:
   2823        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
   2824        FUNC_RESTORE
   2825        RET
   2826SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
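        #######################################################################
        # Taken together, the gen4 entry points form a streaming interface:
        # one init call, any number of enc/dec update calls, then finalize to
        # emit the tag.  A hedged sketch of that call order (prototypes as in
        # the banners above; gcm_context_data treated as opaque here, u8/u64
        # standing in for the kernel's fixed-width types):
        #
        #   void gcm_encrypt_stream_sketch(void *ctx,
        #                                  struct gcm_context_data *data,
        #                                  u8 *iv, u8 *hash_subkey,
        #                                  const u8 *aad, u64 aad_len,
        #                                  u8 *out, const u8 *in, u64 len,
        #                                  u8 *tag, u64 tag_len)
        #   {
        #           aesni_gcm_init_avx_gen4(ctx, data, iv, hash_subkey,
        #                                   aad, aad_len);
        #           aesni_gcm_enc_update_avx_gen4(ctx, data, out, in, len);
        #           aesni_gcm_finalize_avx_gen4(ctx, data, tag, tag_len);
        #   }
        #######################################################################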