cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

aesni-intel_asm.S (84373B)


      1/* SPDX-License-Identifier: GPL-2.0-or-later */
      2/*
      3 * Implement AES algorithm in Intel AES-NI instructions.
      4 *
      5 * The white paper of AES-NI instructions can be downloaded from:
      6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
      7 *
      8 * Copyright (C) 2008, Intel Corp.
      9 *    Author: Huang Ying <ying.huang@intel.com>
     10 *            Vinodh Gopal <vinodh.gopal@intel.com>
     11 *            Kahraman Akdemir
     12 *
     13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
     14 * interface for 64-bit kernels.
     15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
     16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
     17 *             Adrian Hoban <adrian.hoban@intel.com>
     18 *             James Guilford (james.guilford@intel.com)
     19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
     20 *             Tadeusz Struk (tadeusz.struk@intel.com)
     21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
     22 *    Copyright (c) 2010, Intel Corporation.
     23 *
     24 * Ported x86_64 version to x86:
     25 *    Author: Mathias Krause <minipli@googlemail.com>
     26 */
     27
     28#include <linux/linkage.h>
     29#include <asm/frame.h>
     30#include <asm/nospec-branch.h>
     31
     32/*
     33 * The following macros are used to move an (un)aligned 16 byte value to/from
      34 * an XMM register.  This can be done for either FP or integer values: for FP
      35 * use movaps (move aligned packed single), for integer use movdqa (move double
      36 * quad aligned).  Since Nehalem (the original Core i7) there is no performance
      37 * difference between the two, but movaps is a byte shorter, so that is the one
      38 * we use here (likewise for the unaligned variants).
     39 */
     40#define MOVADQ	movaps
     41#define MOVUDQ	movups
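        # Illustrative note (not from the original source): with these defines,
        # "MOVADQ SHUF_MASK(%rip), %xmm14" later in this file assembles to an
        # aligned movaps load, and MOVUDQ gives the unaligned movups form.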
     42
     43#ifdef __x86_64__
     44
     45# constants in mergeable sections, linker can reorder and merge
     46.section	.rodata.cst16.POLY, "aM", @progbits, 16
     47.align 16
     48POLY:   .octa 0xC2000000000000000000000000000001
     49.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
     50.align 16
     51TWOONE: .octa 0x00000001000000000000000000000001
     52
     53.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
     54.align 16
     55SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
     56.section	.rodata.cst16.MASK1, "aM", @progbits, 16
     57.align 16
     58MASK1:      .octa 0x0000000000000000ffffffffffffffff
     59.section	.rodata.cst16.MASK2, "aM", @progbits, 16
     60.align 16
     61MASK2:      .octa 0xffffffffffffffff0000000000000000
     62.section	.rodata.cst16.ONE, "aM", @progbits, 16
     63.align 16
     64ONE:        .octa 0x00000000000000000000000000000001
     65.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
     66.align 16
     67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
     68.section	.rodata.cst16.dec, "aM", @progbits, 16
     69.align 16
     70dec:        .octa 0x1
     71.section	.rodata.cst16.enc, "aM", @progbits, 16
     72.align 16
     73enc:        .octa 0x2
     74
     75# order of these constants should not change.
     76# more specifically, ALL_F should follow SHIFT_MASK,
     77# and zero should follow ALL_F
     78.section	.rodata, "a", @progbits
     79.align 16
     80SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
     81ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
     82            .octa 0x00000000000000000000000000000000
     83
     84.text
     85
     86
     87#define	STACK_OFFSET    8*3
     88
     89#define AadHash 16*0
     90#define AadLen 16*1
     91#define InLen (16*1)+8
     92#define PBlockEncKey 16*2
     93#define OrigIV 16*3
     94#define CurCount 16*4
     95#define PBlockLen 16*5
     96#define	HashKey		16*6	// store HashKey <<1 mod poly here
     97#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
     98#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
     99#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
    100#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
    101				// bits of  HashKey <<1 mod poly here
    102				//(for Karatsuba purposes)
    103#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
    104				// bits of  HashKey^2 <<1 mod poly here
    105				// (for Karatsuba purposes)
    106#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
    107				// bits of  HashKey^3 <<1 mod poly here
    108				// (for Karatsuba purposes)
    109#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
    110				// bits of  HashKey^4 <<1 mod poly here
    111				// (for Karatsuba purposes)
    112
    113#define arg1 rdi
    114#define arg2 rsi
    115#define arg3 rdx
    116#define arg4 rcx
    117#define arg5 r8
    118#define arg6 r9
    119#define arg7 STACK_OFFSET+8(%rsp)
    120#define arg8 STACK_OFFSET+16(%rsp)
    121#define arg9 STACK_OFFSET+24(%rsp)
    122#define arg10 STACK_OFFSET+32(%rsp)
    123#define arg11 STACK_OFFSET+40(%rsp)
    124#define keysize 2*15*16(%arg1)
    125#endif
    126
    127
    128#define STATE1	%xmm0
    129#define STATE2	%xmm4
    130#define STATE3	%xmm5
    131#define STATE4	%xmm6
    132#define STATE	STATE1
    133#define IN1	%xmm1
    134#define IN2	%xmm7
    135#define IN3	%xmm8
    136#define IN4	%xmm9
    137#define IN	IN1
    138#define KEY	%xmm2
    139#define IV	%xmm3
    140
    141#define BSWAP_MASK %xmm10
    142#define CTR	%xmm11
    143#define INC	%xmm12
    144
    145#define GF128MUL_MASK %xmm7
    146
    147#ifdef __x86_64__
    148#define AREG	%rax
    149#define KEYP	%rdi
    150#define OUTP	%rsi
    151#define UKEYP	OUTP
    152#define INP	%rdx
    153#define LEN	%rcx
    154#define IVP	%r8
    155#define KLEN	%r9d
    156#define T1	%r10
    157#define TKEYP	T1
    158#define T2	%r11
    159#define TCTR_LOW T2
    160#else
    161#define AREG	%eax
    162#define KEYP	%edi
    163#define OUTP	AREG
    164#define UKEYP	OUTP
    165#define INP	%edx
    166#define LEN	%esi
    167#define IVP	%ebp
    168#define KLEN	%ebx
    169#define T1	%ecx
    170#define TKEYP	T1
    171#endif
    172
    173.macro FUNC_SAVE
    174	push	%r12
    175	push	%r13
    176	push	%r14
    177#
    178# states of %xmm registers %xmm6:%xmm15 not saved
    179# all %xmm registers are clobbered
    180#
    181.endm
    182
    183
    184.macro FUNC_RESTORE
    185	pop	%r14
    186	pop	%r13
    187	pop	%r12
    188.endm
    189
    190# Precompute hashkeys.
    191# Input: Hash subkey.
    192# Output: HashKeys stored in gcm_context_data.  Only needs to be called
    193# once per key.
    194# clobbers r12, and tmp xmm registers.
    195.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
    196	mov	\SUBKEY, %r12
    197	movdqu	(%r12), \TMP3
    198	movdqa	SHUF_MASK(%rip), \TMP2
    199	pshufb	\TMP2, \TMP3
    200
    201	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
    202
    203	movdqa	\TMP3, \TMP2
    204	psllq	$1, \TMP3
    205	psrlq	$63, \TMP2
    206	movdqa	\TMP2, \TMP1
    207	pslldq	$8, \TMP2
    208	psrldq	$8, \TMP1
    209	por	\TMP2, \TMP3
    210
    211	# reduce HashKey<<1
    212
    213	pshufd	$0x24, \TMP1, \TMP2
    214	pcmpeqd TWOONE(%rip), \TMP2
    215	pand	POLY(%rip), \TMP2
    216	pxor	\TMP2, \TMP3
    217	movdqu	\TMP3, HashKey(%arg2)
    218
    219	movdqa	   \TMP3, \TMP5
    220	pshufd	   $78, \TMP3, \TMP1
    221	pxor	   \TMP3, \TMP1
    222	movdqu	   \TMP1, HashKey_k(%arg2)
    223
    224	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
    225# TMP5 = HashKey^2<<1 (mod poly)
    226	movdqu	   \TMP5, HashKey_2(%arg2)
    227# HashKey_2 = HashKey^2<<1 (mod poly)
    228	pshufd	   $78, \TMP5, \TMP1
    229	pxor	   \TMP5, \TMP1
    230	movdqu	   \TMP1, HashKey_2_k(%arg2)
    231
    232	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
    233# TMP5 = HashKey^3<<1 (mod poly)
    234	movdqu	   \TMP5, HashKey_3(%arg2)
    235	pshufd	   $78, \TMP5, \TMP1
    236	pxor	   \TMP5, \TMP1
    237	movdqu	   \TMP1, HashKey_3_k(%arg2)
    238
    239	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
     240# TMP5 = HashKey^4<<1 (mod poly)
    241	movdqu	   \TMP5, HashKey_4(%arg2)
    242	pshufd	   $78, \TMP5, \TMP1
    243	pxor	   \TMP5, \TMP1
    244	movdqu	   \TMP1, HashKey_4_k(%arg2)
    245.endm
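        # Editorial sketch (not from the original comments): precomputing H, H^2,
        # H^3 and H^4 lets GHASH_4_ENCRYPT_4_PARALLEL_* fold four blocks per pass,
        # since ((((Y^C1)*H ^ C2)*H ^ C3)*H ^ C4)*H
        #        = (Y^C1)*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H,
        # and the HashKey*_k entries cache the XOR of each key's 64-bit halves
        # for the Karatsuba middle product.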
    246
    247# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
    248# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
    249.macro GCM_INIT Iv SUBKEY AAD AADLEN
    250	mov \AADLEN, %r11
    251	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
    252	xor %r11d, %r11d
    253	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
    254	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
    255	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
    256	mov \Iv, %rax
    257	movdqu (%rax), %xmm0
    258	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
    259
    260	movdqa  SHUF_MASK(%rip), %xmm2
    261	pshufb %xmm2, %xmm0
    262	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
    263
    264	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
    265	movdqu HashKey(%arg2), %xmm13
    266
    267	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
    268	%xmm4, %xmm5, %xmm6
    269.endm
    270
    271# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
    272# struct has been initialized by GCM_INIT.
    273# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
    274# Clobbers rax, r10-r13, and xmm0-xmm15
    275.macro GCM_ENC_DEC operation
    276	movdqu AadHash(%arg2), %xmm8
    277	movdqu HashKey(%arg2), %xmm13
    278	add %arg5, InLen(%arg2)
    279
    280	xor %r11d, %r11d # initialise the data pointer offset as zero
    281	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
    282
    283	sub %r11, %arg5		# sub partial block data used
    284	mov %arg5, %r13		# save the number of bytes
    285
    286	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
    287	mov %r13, %r12
    288	# Encrypt/Decrypt first few blocks
    289
    290	and	$(3<<4), %r12
    291	jz	_initial_num_blocks_is_0_\@
    292	cmp	$(2<<4), %r12
    293	jb	_initial_num_blocks_is_1_\@
    294	je	_initial_num_blocks_is_2_\@
    295_initial_num_blocks_is_3_\@:
    296	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
    297%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
    298	sub	$48, %r13
    299	jmp	_initial_blocks_\@
    300_initial_num_blocks_is_2_\@:
    301	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
    302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
    303	sub	$32, %r13
    304	jmp	_initial_blocks_\@
    305_initial_num_blocks_is_1_\@:
    306	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
    307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
    308	sub	$16, %r13
    309	jmp	_initial_blocks_\@
    310_initial_num_blocks_is_0_\@:
    311	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
    312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
    313_initial_blocks_\@:
    314
    315	# Main loop - Encrypt/Decrypt remaining blocks
    316
    317	test	%r13, %r13
    318	je	_zero_cipher_left_\@
    319	sub	$64, %r13
    320	je	_four_cipher_left_\@
    321_crypt_by_4_\@:
    322	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
    323	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
    324	%xmm7, %xmm8, enc
    325	add	$64, %r11
    326	sub	$64, %r13
    327	jne	_crypt_by_4_\@
    328_four_cipher_left_\@:
    329	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
    330%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
    331_zero_cipher_left_\@:
    332	movdqu %xmm8, AadHash(%arg2)
    333	movdqu %xmm0, CurCount(%arg2)
    334
    335	mov	%arg5, %r13
    336	and	$15, %r13			# %r13 = arg5 (mod 16)
    337	je	_multiple_of_16_bytes_\@
    338
    339	mov %r13, PBlockLen(%arg2)
    340
    341	# Handle the last <16 Byte block separately
    342	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
    343	movdqu %xmm0, CurCount(%arg2)
    344	movdqa SHUF_MASK(%rip), %xmm10
    345	pshufb %xmm10, %xmm0
    346
    347	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
    348	movdqu %xmm0, PBlockEncKey(%arg2)
    349
    350	cmp	$16, %arg5
    351	jge _large_enough_update_\@
    352
    353	lea (%arg4,%r11,1), %r10
    354	mov %r13, %r12
    355	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
    356	jmp _data_read_\@
    357
    358_large_enough_update_\@:
    359	sub	$16, %r11
    360	add	%r13, %r11
    361
    362	# receive the last <16 Byte block
    363	movdqu	(%arg4, %r11, 1), %xmm1
    364
    365	sub	%r13, %r11
    366	add	$16, %r11
    367
    368	lea	SHIFT_MASK+16(%rip), %r12
    369	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
    370	# (r13 is the number of bytes in plaintext mod 16)
    371	sub	%r13, %r12
    372	# get the appropriate shuffle mask
    373	movdqu	(%r12), %xmm2
    374	# shift right 16-r13 bytes
    375	pshufb  %xmm2, %xmm1
    376
    377_data_read_\@:
    378	lea ALL_F+16(%rip), %r12
    379	sub %r13, %r12
    380
    381.ifc \operation, dec
    382	movdqa  %xmm1, %xmm2
    383.endif
    384	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
    385	movdqu	(%r12), %xmm1
    386	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
    387	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
    388.ifc \operation, dec
    389	pand    %xmm1, %xmm2
    390	movdqa SHUF_MASK(%rip), %xmm10
    391	pshufb %xmm10 ,%xmm2
    392
    393	pxor %xmm2, %xmm8
    394.else
    395	movdqa SHUF_MASK(%rip), %xmm10
    396	pshufb %xmm10,%xmm0
    397
    398	pxor	%xmm0, %xmm8
    399.endif
    400
    401	movdqu %xmm8, AadHash(%arg2)
    402.ifc \operation, enc
    403	# GHASH computation for the last <16 byte block
    404	movdqa SHUF_MASK(%rip), %xmm10
    405	# shuffle xmm0 back to output as ciphertext
    406	pshufb %xmm10, %xmm0
    407.endif
    408
    409	# Output %r13 bytes
    410	movq %xmm0, %rax
    411	cmp $8, %r13
    412	jle _less_than_8_bytes_left_\@
    413	mov %rax, (%arg3 , %r11, 1)
    414	add $8, %r11
    415	psrldq $8, %xmm0
    416	movq %xmm0, %rax
    417	sub $8, %r13
    418_less_than_8_bytes_left_\@:
    419	mov %al,  (%arg3, %r11, 1)
    420	add $1, %r11
    421	shr $8, %rax
    422	sub $1, %r13
    423	jne _less_than_8_bytes_left_\@
    424_multiple_of_16_bytes_\@:
    425.endm
    426
     427# GCM_COMPLETE finishes the GHASH of the last partial block and computes the tag.
     428# Output: Authentication Tag (AUTH_TAG)
    429# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
    430.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
    431	movdqu AadHash(%arg2), %xmm8
    432	movdqu HashKey(%arg2), %xmm13
    433
    434	mov PBlockLen(%arg2), %r12
    435
    436	test %r12, %r12
    437	je _partial_done\@
    438
    439	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
    440
    441_partial_done\@:
     442	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
    443	shl	$3, %r12		  # convert into number of bits
    444	movd	%r12d, %xmm15		  # len(A) in %xmm15
    445	mov InLen(%arg2), %r12
     446	shl     $3, %r12                  # len(C) in bits (len * 8)
    447	movq    %r12, %xmm1
    448
    449	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
    450	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
    451	pxor	%xmm15, %xmm8
    452	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
    453	# final GHASH computation
    454	movdqa SHUF_MASK(%rip), %xmm10
    455	pshufb %xmm10, %xmm8
    456
    457	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
    458	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
    459	pxor	%xmm8, %xmm0
    460_return_T_\@:
    461	mov	\AUTHTAG, %r10                     # %r10 = authTag
    462	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
    463	cmp	$16, %r11
    464	je	_T_16_\@
    465	cmp	$8, %r11
    466	jl	_T_4_\@
    467_T_8_\@:
    468	movq	%xmm0, %rax
    469	mov	%rax, (%r10)
    470	add	$8, %r10
    471	sub	$8, %r11
    472	psrldq	$8, %xmm0
    473	test	%r11, %r11
    474	je	_return_T_done_\@
    475_T_4_\@:
    476	movd	%xmm0, %eax
    477	mov	%eax, (%r10)
    478	add	$4, %r10
    479	sub	$4, %r11
    480	psrldq	$4, %xmm0
    481	test	%r11, %r11
    482	je	_return_T_done_\@
    483_T_123_\@:
    484	movd	%xmm0, %eax
    485	cmp	$2, %r11
    486	jl	_T_1_\@
    487	mov	%ax, (%r10)
    488	cmp	$2, %r11
    489	je	_return_T_done_\@
    490	add	$2, %r10
    491	sar	$16, %eax
    492_T_1_\@:
    493	mov	%al, (%r10)
    494	jmp	_return_T_done_\@
    495_T_16_\@:
    496	movdqu	%xmm0, (%r10)
    497_return_T_done_\@:
    498.endm
    499
    500#ifdef __x86_64__
    501/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
    502*
    503*
    504* Input: A and B (128-bits each, bit-reflected)
    505* Output: C = A*B*x mod poly, (i.e. >>1 )
    506* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
    507* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
    508*
    509*/
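        /*
        * Editorial sketch of the math used below (not from the original source):
        * with A = a1:a0 and B = b1:b0 split into 64-bit halves, the 256-bit
        * carry-less product is assembled Karatsuba-style as
        *   A*B = (a1*b1)<<128 ^ ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)<<64 ^ (a0*b0)
        * and then reduced modulo x^128 + x^127 + x^126 + x^121 + 1 in two phases.
        */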
    510.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
    511	movdqa	  \GH, \TMP1
    512	pshufd	  $78, \GH, \TMP2
    513	pshufd	  $78, \HK, \TMP3
    514	pxor	  \GH, \TMP2            # TMP2 = a1+a0
    515	pxor	  \HK, \TMP3            # TMP3 = b1+b0
    516	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
    517	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
    518	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
    519	pxor	  \GH, \TMP2
     520	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
    521	movdqa	  \TMP2, \TMP3
    522	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
    523	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
    524	pxor	  \TMP3, \GH
     525	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
    526
    527        # first phase of the reduction
    528
    529	movdqa    \GH, \TMP2
    530	movdqa    \GH, \TMP3
    531	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
     532					# in order to perform
    533					# independent shifts
    534	pslld     $31, \TMP2            # packed right shift <<31
    535	pslld     $30, \TMP3            # packed right shift <<30
    536	pslld     $25, \TMP4            # packed right shift <<25
    537	pxor      \TMP3, \TMP2          # xor the shifted versions
    538	pxor      \TMP4, \TMP2
    539	movdqa    \TMP2, \TMP5
    540	psrldq    $4, \TMP5             # right shift TMP5 1 DW
    541	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
    542	pxor      \TMP2, \GH
    543
    544        # second phase of the reduction
    545
    546	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
     547					# in order to perform
    548					# independent shifts
    549	movdqa    \GH,\TMP3
    550	movdqa    \GH,\TMP4
    551	psrld     $1,\TMP2              # packed left shift >>1
    552	psrld     $2,\TMP3              # packed left shift >>2
    553	psrld     $7,\TMP4              # packed left shift >>7
    554	pxor      \TMP3,\TMP2		# xor the shifted versions
    555	pxor      \TMP4,\TMP2
    556	pxor      \TMP5, \TMP2
    557	pxor      \TMP2, \GH
     558	pxor      \TMP1, \GH            # result is in GH
    559.endm
    560
    561# Reads DLEN bytes starting at DPTR and stores in XMMDst
    562# where 0 < DLEN < 16
    563# Clobbers %rax, DLEN and XMM1
    564.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
    565        cmp $8, \DLEN
    566        jl _read_lt8_\@
    567        mov (\DPTR), %rax
    568        movq %rax, \XMMDst
    569        sub $8, \DLEN
    570        jz _done_read_partial_block_\@
    571	xor %eax, %eax
    572_read_next_byte_\@:
    573        shl $8, %rax
    574        mov 7(\DPTR, \DLEN, 1), %al
    575        dec \DLEN
    576        jnz _read_next_byte_\@
    577        movq %rax, \XMM1
    578	pslldq $8, \XMM1
    579        por \XMM1, \XMMDst
    580	jmp _done_read_partial_block_\@
    581_read_lt8_\@:
    582	xor %eax, %eax
    583_read_next_byte_lt8_\@:
    584        shl $8, %rax
    585        mov -1(\DPTR, \DLEN, 1), %al
    586        dec \DLEN
    587        jnz _read_next_byte_lt8_\@
    588        movq %rax, \XMMDst
    589_done_read_partial_block_\@:
    590.endm
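        # Worked example (illustrative, not from the original source): for DLEN = 13
        # the first 8 bytes are loaded with one 64-bit mov into the low half of
        # XMMDst, the remaining 5 bytes are accumulated byte-by-byte into %rax,
        # moved to XMM1, shifted into the high half with pslldq and OR-ed in, so
        # XMMDst ends up holding DPTR[0..12] with the top 3 bytes zero.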
    591
    592# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
    593# clobbers r10-11, xmm14
    594.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
    595	TMP6 TMP7
    596	MOVADQ	   SHUF_MASK(%rip), %xmm14
    597	mov	   \AAD, %r10		# %r10 = AAD
    598	mov	   \AADLEN, %r11		# %r11 = aadLen
    599	pxor	   \TMP7, \TMP7
    600	pxor	   \TMP6, \TMP6
    601
    602	cmp	   $16, %r11
    603	jl	   _get_AAD_rest\@
    604_get_AAD_blocks\@:
    605	movdqu	   (%r10), \TMP7
    606	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
    607	pxor	   \TMP7, \TMP6
    608	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
    609	add	   $16, %r10
    610	sub	   $16, %r11
    611	cmp	   $16, %r11
    612	jge	   _get_AAD_blocks\@
    613
    614	movdqu	   \TMP6, \TMP7
    615
    616	/* read the last <16B of AAD */
    617_get_AAD_rest\@:
    618	test	   %r11, %r11
    619	je	   _get_AAD_done\@
    620
    621	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
    622	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
    623	pxor	   \TMP6, \TMP7
    624	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
    625	movdqu \TMP7, \TMP6
    626
    627_get_AAD_done\@:
    628	movdqu \TMP6, AadHash(%arg2)
    629.endm
    630
    631# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
    632# between update calls.
    633# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
     634# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
    635# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
    636.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
    637	AAD_HASH operation
    638	mov 	PBlockLen(%arg2), %r13
    639	test	%r13, %r13
    640	je	_partial_block_done_\@	# Leave Macro if no partial blocks
    641	# Read in input data without over reading
    642	cmp	$16, \PLAIN_CYPH_LEN
    643	jl	_fewer_than_16_bytes_\@
    644	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
    645	jmp	_data_read_\@
    646
    647_fewer_than_16_bytes_\@:
    648	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
    649	mov	\PLAIN_CYPH_LEN, %r12
    650	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
    651
    652	mov PBlockLen(%arg2), %r13
    653
    654_data_read_\@:				# Finished reading in data
    655
    656	movdqu	PBlockEncKey(%arg2), %xmm9
    657	movdqu	HashKey(%arg2), %xmm13
    658
    659	lea	SHIFT_MASK(%rip), %r12
    660
    661	# adjust the shuffle mask pointer to be able to shift r13 bytes
     662	# (r13 = PBlockLen, the number of bytes already in the partial block)
    663	add	%r13, %r12
    664	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
    665	pshufb	%xmm2, %xmm9		# shift right r13 bytes
    666
    667.ifc \operation, dec
    668	movdqa	%xmm1, %xmm3
    669	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
    670
    671	mov	\PLAIN_CYPH_LEN, %r10
    672	add	%r13, %r10
    673	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
    674	sub	$16, %r10
     675	# Determine if the partial block is not being filled and
    676	# shift mask accordingly
    677	jge	_no_extra_mask_1_\@
    678	sub	%r10, %r12
    679_no_extra_mask_1_\@:
    680
    681	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
    682	# get the appropriate mask to mask out bottom r13 bytes of xmm9
    683	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
    684
    685	pand	%xmm1, %xmm3
    686	movdqa	SHUF_MASK(%rip), %xmm10
    687	pshufb	%xmm10, %xmm3
    688	pshufb	%xmm2, %xmm3
    689	pxor	%xmm3, \AAD_HASH
    690
    691	test	%r10, %r10
    692	jl	_partial_incomplete_1_\@
    693
    694	# GHASH computation for the last <16 Byte block
    695	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
    696	xor	%eax, %eax
    697
    698	mov	%rax, PBlockLen(%arg2)
    699	jmp	_dec_done_\@
    700_partial_incomplete_1_\@:
    701	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
    702_dec_done_\@:
    703	movdqu	\AAD_HASH, AadHash(%arg2)
    704.else
    705	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
    706
    707	mov	\PLAIN_CYPH_LEN, %r10
    708	add	%r13, %r10
    709	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
    710	sub	$16, %r10
     711	# Determine if the partial block is not being filled and
    712	# shift mask accordingly
    713	jge	_no_extra_mask_2_\@
    714	sub	%r10, %r12
    715_no_extra_mask_2_\@:
    716
    717	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
    718	# get the appropriate mask to mask out bottom r13 bytes of xmm9
    719	pand	%xmm1, %xmm9
    720
    721	movdqa	SHUF_MASK(%rip), %xmm1
    722	pshufb	%xmm1, %xmm9
    723	pshufb	%xmm2, %xmm9
    724	pxor	%xmm9, \AAD_HASH
    725
    726	test	%r10, %r10
    727	jl	_partial_incomplete_2_\@
    728
    729	# GHASH computation for the last <16 Byte block
    730	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
    731	xor	%eax, %eax
    732
    733	mov	%rax, PBlockLen(%arg2)
    734	jmp	_encode_done_\@
    735_partial_incomplete_2_\@:
    736	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
    737_encode_done_\@:
    738	movdqu	\AAD_HASH, AadHash(%arg2)
    739
    740	movdqa	SHUF_MASK(%rip), %xmm10
    741	# shuffle xmm9 back to output as ciphertext
    742	pshufb	%xmm10, %xmm9
    743	pshufb	%xmm2, %xmm9
    744.endif
    745	# output encrypted Bytes
    746	test	%r10, %r10
    747	jl	_partial_fill_\@
    748	mov	%r13, %r12
    749	mov	$16, %r13
    750	# Set r13 to be the number of bytes to write out
    751	sub	%r12, %r13
    752	jmp	_count_set_\@
    753_partial_fill_\@:
    754	mov	\PLAIN_CYPH_LEN, %r13
    755_count_set_\@:
    756	movdqa	%xmm9, %xmm0
    757	movq	%xmm0, %rax
    758	cmp	$8, %r13
    759	jle	_less_than_8_bytes_left_\@
    760
    761	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
    762	add	$8, \DATA_OFFSET
    763	psrldq	$8, %xmm0
    764	movq	%xmm0, %rax
    765	sub	$8, %r13
    766_less_than_8_bytes_left_\@:
    767	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
    768	add	$1, \DATA_OFFSET
    769	shr	$8, %rax
    770	sub	$1, %r13
    771	jne	_less_than_8_bytes_left_\@
    772_partial_block_done_\@:
    773.endm # PARTIAL_BLOCK
    774
    775/*
    776* if a = number of total plaintext bytes
    777* b = floor(a/16)
    778* num_initial_blocks = b mod 4
    779* encrypt the initial num_initial_blocks blocks and apply ghash on
    780* the ciphertext
    781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
    782* are clobbered
     783* arg1, %arg2, %arg3 are used as pointers only, not modified
    784*/
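        # Worked example (illustrative, not from the original comment): for a = 100
        # plaintext bytes, b = floor(100/16) = 6 full blocks, so num_initial_blocks
        # = 6 mod 4 = 2 and GCM_ENC_DEC dispatches to _initial_num_blocks_is_2
        # before entering the 4-blocks-per-iteration main loop.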
    785
    786
    787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
    788	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
    789	MOVADQ		SHUF_MASK(%rip), %xmm14
    790
     791	movdqu AadHash(%arg2), %xmm\i		    # %xmm\i = current hash (AadHash)
    792
    793	# start AES for num_initial_blocks blocks
    794
    795	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
    796
    797.if (\i == 5) || (\i == 6) || (\i == 7)
    798
    799	MOVADQ		ONE(%RIP),\TMP1
    800	MOVADQ		0(%arg1),\TMP2
    801.irpc index, \i_seq
    802	paddd		\TMP1, \XMM0                 # INCR Y0
    803.ifc \operation, dec
    804        movdqa     \XMM0, %xmm\index
    805.else
    806	MOVADQ		\XMM0, %xmm\index
    807.endif
    808	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
    809	pxor		\TMP2, %xmm\index
    810.endr
    811	lea	0x10(%arg1),%r10
    812	mov	keysize,%eax
    813	shr	$2,%eax				# 128->4, 192->6, 256->8
    814	add	$5,%eax			      # 128->9, 192->11, 256->13
    815
    816aes_loop_initial_\@:
    817	MOVADQ	(%r10),\TMP1
    818.irpc	index, \i_seq
    819	aesenc	\TMP1, %xmm\index
    820.endr
    821	add	$16,%r10
    822	sub	$1,%eax
    823	jnz	aes_loop_initial_\@
    824
    825	MOVADQ	(%r10), \TMP1
    826.irpc index, \i_seq
    827	aesenclast \TMP1, %xmm\index         # Last Round
    828.endr
    829.irpc index, \i_seq
    830	movdqu	   (%arg4 , %r11, 1), \TMP1
    831	pxor	   \TMP1, %xmm\index
    832	movdqu	   %xmm\index, (%arg3 , %r11, 1)
    833	# write back plaintext/ciphertext for num_initial_blocks
    834	add	   $16, %r11
    835
    836.ifc \operation, dec
    837	movdqa     \TMP1, %xmm\index
    838.endif
    839	pshufb	   %xmm14, %xmm\index
    840
    841		# prepare plaintext/ciphertext for GHASH computation
    842.endr
    843.endif
    844
    845        # apply GHASH on num_initial_blocks blocks
    846
    847.if \i == 5
    848        pxor       %xmm5, %xmm6
    849	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
    850        pxor       %xmm6, %xmm7
    851	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
    852        pxor       %xmm7, %xmm8
    853	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
    854.elseif \i == 6
    855        pxor       %xmm6, %xmm7
    856	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
    857        pxor       %xmm7, %xmm8
    858	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
    859.elseif \i == 7
    860        pxor       %xmm7, %xmm8
    861	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
    862.endif
    863	cmp	   $64, %r13
    864	jl	_initial_blocks_done\@
    865	# no need for precomputed values
    866/*
    867*
    868* Precomputations for HashKey parallel with encryption of first 4 blocks.
     869* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
    870*/
    871	MOVADQ	   ONE(%RIP),\TMP1
    872	paddd	   \TMP1, \XMM0              # INCR Y0
    873	MOVADQ	   \XMM0, \XMM1
    874	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
    875
    876	paddd	   \TMP1, \XMM0              # INCR Y0
    877	MOVADQ	   \XMM0, \XMM2
    878	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
    879
    880	paddd	   \TMP1, \XMM0              # INCR Y0
    881	MOVADQ	   \XMM0, \XMM3
    882	pshufb %xmm14, \XMM3        # perform a 16 byte swap
    883
    884	paddd	   \TMP1, \XMM0              # INCR Y0
    885	MOVADQ	   \XMM0, \XMM4
    886	pshufb %xmm14, \XMM4        # perform a 16 byte swap
    887
    888	MOVADQ	   0(%arg1),\TMP1
    889	pxor	   \TMP1, \XMM1
    890	pxor	   \TMP1, \XMM2
    891	pxor	   \TMP1, \XMM3
    892	pxor	   \TMP1, \XMM4
    893.irpc index, 1234 # do 4 rounds
    894	movaps 0x10*\index(%arg1), \TMP1
    895	aesenc	   \TMP1, \XMM1
    896	aesenc	   \TMP1, \XMM2
    897	aesenc	   \TMP1, \XMM3
    898	aesenc	   \TMP1, \XMM4
    899.endr
    900.irpc index, 56789 # do next 5 rounds
    901	movaps 0x10*\index(%arg1), \TMP1
    902	aesenc	   \TMP1, \XMM1
    903	aesenc	   \TMP1, \XMM2
    904	aesenc	   \TMP1, \XMM3
    905	aesenc	   \TMP1, \XMM4
    906.endr
    907	lea	   0xa0(%arg1),%r10
    908	mov	   keysize,%eax
    909	shr	   $2,%eax			# 128->4, 192->6, 256->8
    910	sub	   $4,%eax			# 128->0, 192->2, 256->4
    911	jz	   aes_loop_pre_done\@
    912
    913aes_loop_pre_\@:
    914	MOVADQ	   (%r10),\TMP2
    915.irpc	index, 1234
    916	aesenc	   \TMP2, %xmm\index
    917.endr
    918	add	   $16,%r10
    919	sub	   $1,%eax
    920	jnz	   aes_loop_pre_\@
    921
    922aes_loop_pre_done\@:
    923	MOVADQ	   (%r10), \TMP2
    924	aesenclast \TMP2, \XMM1
    925	aesenclast \TMP2, \XMM2
    926	aesenclast \TMP2, \XMM3
    927	aesenclast \TMP2, \XMM4
    928	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
    929	pxor	   \TMP1, \XMM1
    930.ifc \operation, dec
    931	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
    932	movdqa     \TMP1, \XMM1
    933.endif
    934	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
    935	pxor	   \TMP1, \XMM2
    936.ifc \operation, dec
    937	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
    938	movdqa     \TMP1, \XMM2
    939.endif
    940	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
    941	pxor	   \TMP1, \XMM3
    942.ifc \operation, dec
    943	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
    944	movdqa     \TMP1, \XMM3
    945.endif
    946	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
    947	pxor	   \TMP1, \XMM4
    948.ifc \operation, dec
    949	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
    950	movdqa     \TMP1, \XMM4
    951.else
    952	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
    953	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
    954	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
    955	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
    956.endif
    957
    958	add	   $64, %r11
    959	pshufb %xmm14, \XMM1 # perform a 16 byte swap
    960	pxor	   \XMMDst, \XMM1
    961# combine GHASHed value with the corresponding ciphertext
    962	pshufb %xmm14, \XMM2 # perform a 16 byte swap
    963	pshufb %xmm14, \XMM3 # perform a 16 byte swap
    964	pshufb %xmm14, \XMM4 # perform a 16 byte swap
    965
    966_initial_blocks_done\@:
    967
    968.endm
    969
    970/*
    971* encrypt 4 blocks at a time
    972* ghash the 4 previously encrypted ciphertext blocks
    973* arg1, %arg3, %arg4 are used as pointers only, not modified
    974* %r11 is the data offset value
    975*/
    976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
    977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
    978
    979	movdqa	  \XMM1, \XMM5
    980	movdqa	  \XMM2, \XMM6
    981	movdqa	  \XMM3, \XMM7
    982	movdqa	  \XMM4, \XMM8
    983
    984        movdqa    SHUF_MASK(%rip), %xmm15
    985        # multiply TMP5 * HashKey using karatsuba
    986
    987	movdqa	  \XMM5, \TMP4
    988	pshufd	  $78, \XMM5, \TMP6
    989	pxor	  \XMM5, \TMP6
    990	paddd     ONE(%rip), \XMM0		# INCR CNT
    991	movdqu	  HashKey_4(%arg2), \TMP5
    992	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
    993	movdqa    \XMM0, \XMM1
    994	paddd     ONE(%rip), \XMM0		# INCR CNT
    995	movdqa    \XMM0, \XMM2
    996	paddd     ONE(%rip), \XMM0		# INCR CNT
    997	movdqa    \XMM0, \XMM3
    998	paddd     ONE(%rip), \XMM0		# INCR CNT
    999	movdqa    \XMM0, \XMM4
   1000	pshufb %xmm15, \XMM1	# perform a 16 byte swap
   1001	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
   1002	pshufb %xmm15, \XMM2	# perform a 16 byte swap
   1003	pshufb %xmm15, \XMM3	# perform a 16 byte swap
   1004	pshufb %xmm15, \XMM4	# perform a 16 byte swap
   1005
   1006	pxor	  (%arg1), \XMM1
   1007	pxor	  (%arg1), \XMM2
   1008	pxor	  (%arg1), \XMM3
   1009	pxor	  (%arg1), \XMM4
   1010	movdqu	  HashKey_4_k(%arg2), \TMP5
   1011	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
   1012	movaps 0x10(%arg1), \TMP1
   1013	aesenc	  \TMP1, \XMM1              # Round 1
   1014	aesenc	  \TMP1, \XMM2
   1015	aesenc	  \TMP1, \XMM3
   1016	aesenc	  \TMP1, \XMM4
   1017	movaps 0x20(%arg1), \TMP1
   1018	aesenc	  \TMP1, \XMM1              # Round 2
   1019	aesenc	  \TMP1, \XMM2
   1020	aesenc	  \TMP1, \XMM3
   1021	aesenc	  \TMP1, \XMM4
   1022	movdqa	  \XMM6, \TMP1
   1023	pshufd	  $78, \XMM6, \TMP2
   1024	pxor	  \XMM6, \TMP2
   1025	movdqu	  HashKey_3(%arg2), \TMP5
   1026	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
   1027	movaps 0x30(%arg1), \TMP3
   1028	aesenc    \TMP3, \XMM1              # Round 3
   1029	aesenc    \TMP3, \XMM2
   1030	aesenc    \TMP3, \XMM3
   1031	aesenc    \TMP3, \XMM4
   1032	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
   1033	movaps 0x40(%arg1), \TMP3
   1034	aesenc	  \TMP3, \XMM1              # Round 4
   1035	aesenc	  \TMP3, \XMM2
   1036	aesenc	  \TMP3, \XMM3
   1037	aesenc	  \TMP3, \XMM4
   1038	movdqu	  HashKey_3_k(%arg2), \TMP5
   1039	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
   1040	movaps 0x50(%arg1), \TMP3
   1041	aesenc	  \TMP3, \XMM1              # Round 5
   1042	aesenc	  \TMP3, \XMM2
   1043	aesenc	  \TMP3, \XMM3
   1044	aesenc	  \TMP3, \XMM4
   1045	pxor	  \TMP1, \TMP4
   1046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
   1047	pxor	  \XMM6, \XMM5
   1048	pxor	  \TMP2, \TMP6
   1049	movdqa	  \XMM7, \TMP1
   1050	pshufd	  $78, \XMM7, \TMP2
   1051	pxor	  \XMM7, \TMP2
   1052	movdqu	  HashKey_2(%arg2), \TMP5
   1053
   1054        # Multiply TMP5 * HashKey using karatsuba
   1055
   1056	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
   1057	movaps 0x60(%arg1), \TMP3
   1058	aesenc	  \TMP3, \XMM1              # Round 6
   1059	aesenc	  \TMP3, \XMM2
   1060	aesenc	  \TMP3, \XMM3
   1061	aesenc	  \TMP3, \XMM4
   1062	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
   1063	movaps 0x70(%arg1), \TMP3
   1064	aesenc	  \TMP3, \XMM1              # Round 7
   1065	aesenc	  \TMP3, \XMM2
   1066	aesenc	  \TMP3, \XMM3
   1067	aesenc	  \TMP3, \XMM4
   1068	movdqu	  HashKey_2_k(%arg2), \TMP5
   1069	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
   1070	movaps 0x80(%arg1), \TMP3
   1071	aesenc	  \TMP3, \XMM1              # Round 8
   1072	aesenc	  \TMP3, \XMM2
   1073	aesenc	  \TMP3, \XMM3
   1074	aesenc	  \TMP3, \XMM4
   1075	pxor	  \TMP1, \TMP4
   1076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
   1077	pxor	  \XMM7, \XMM5
   1078	pxor	  \TMP2, \TMP6
   1079
   1080        # Multiply XMM8 * HashKey
   1081        # XMM8 and TMP5 hold the values for the two operands
   1082
   1083	movdqa	  \XMM8, \TMP1
   1084	pshufd	  $78, \XMM8, \TMP2
   1085	pxor	  \XMM8, \TMP2
   1086	movdqu	  HashKey(%arg2), \TMP5
   1087	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
   1088	movaps 0x90(%arg1), \TMP3
   1089	aesenc	  \TMP3, \XMM1             # Round 9
   1090	aesenc	  \TMP3, \XMM2
   1091	aesenc	  \TMP3, \XMM3
   1092	aesenc	  \TMP3, \XMM4
   1093	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
   1094	lea	  0xa0(%arg1),%r10
   1095	mov	  keysize,%eax
   1096	shr	  $2,%eax			# 128->4, 192->6, 256->8
   1097	sub	  $4,%eax			# 128->0, 192->2, 256->4
   1098	jz	  aes_loop_par_enc_done\@
   1099
   1100aes_loop_par_enc\@:
   1101	MOVADQ	  (%r10),\TMP3
   1102.irpc	index, 1234
   1103	aesenc	  \TMP3, %xmm\index
   1104.endr
   1105	add	  $16,%r10
   1106	sub	  $1,%eax
   1107	jnz	  aes_loop_par_enc\@
   1108
   1109aes_loop_par_enc_done\@:
   1110	MOVADQ	  (%r10), \TMP3
   1111	aesenclast \TMP3, \XMM1           # Round 10
   1112	aesenclast \TMP3, \XMM2
   1113	aesenclast \TMP3, \XMM3
   1114	aesenclast \TMP3, \XMM4
   1115	movdqu    HashKey_k(%arg2), \TMP5
   1116	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
   1117	movdqu	  (%arg4,%r11,1), \TMP3
   1118	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
   1119	movdqu	  16(%arg4,%r11,1), \TMP3
   1120	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
   1121	movdqu	  32(%arg4,%r11,1), \TMP3
   1122	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
   1123	movdqu	  48(%arg4,%r11,1), \TMP3
   1124	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
   1125        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
   1126        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
   1127        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
   1128        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
   1129	pshufb %xmm15, \XMM1        # perform a 16 byte swap
   1130	pshufb %xmm15, \XMM2	# perform a 16 byte swap
   1131	pshufb %xmm15, \XMM3	# perform a 16 byte swap
   1132	pshufb %xmm15, \XMM4	# perform a 16 byte swap
   1133
   1134	pxor	  \TMP4, \TMP1
   1135	pxor	  \XMM8, \XMM5
   1136	pxor	  \TMP6, \TMP2
   1137	pxor	  \TMP1, \TMP2
   1138	pxor	  \XMM5, \TMP2
   1139	movdqa	  \TMP2, \TMP3
   1140	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
   1141	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
   1142	pxor	  \TMP3, \XMM5
   1143	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
   1144
   1145        # first phase of reduction
   1146
   1147	movdqa    \XMM5, \TMP2
   1148	movdqa    \XMM5, \TMP3
   1149	movdqa    \XMM5, \TMP4
   1150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
   1151	pslld     $31, \TMP2                   # packed right shift << 31
   1152	pslld     $30, \TMP3                   # packed right shift << 30
   1153	pslld     $25, \TMP4                   # packed right shift << 25
   1154	pxor      \TMP3, \TMP2	               # xor the shifted versions
   1155	pxor      \TMP4, \TMP2
   1156	movdqa    \TMP2, \TMP5
   1157	psrldq    $4, \TMP5                    # right shift T5 1 DW
   1158	pslldq    $12, \TMP2                   # left shift T2 3 DWs
   1159	pxor      \TMP2, \XMM5
   1160
   1161        # second phase of reduction
   1162
   1163	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
   1164	movdqa    \XMM5,\TMP3
   1165	movdqa    \XMM5,\TMP4
   1166	psrld     $1, \TMP2                    # packed left shift >>1
   1167	psrld     $2, \TMP3                    # packed left shift >>2
   1168	psrld     $7, \TMP4                    # packed left shift >>7
   1169	pxor      \TMP3,\TMP2		       # xor the shifted versions
   1170	pxor      \TMP4,\TMP2
   1171	pxor      \TMP5, \TMP2
   1172	pxor      \TMP2, \XMM5
    1173	pxor      \TMP1, \XMM5                 # result is in XMM5
   1174
   1175	pxor	  \XMM5, \XMM1
   1176.endm
   1177
   1178/*
   1179* decrypt 4 blocks at a time
   1180* ghash the 4 previously decrypted ciphertext blocks
   1181* arg1, %arg3, %arg4 are used as pointers only, not modified
   1182* %r11 is the data offset value
   1183*/
   1184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
   1185TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
   1186
   1187	movdqa	  \XMM1, \XMM5
   1188	movdqa	  \XMM2, \XMM6
   1189	movdqa	  \XMM3, \XMM7
   1190	movdqa	  \XMM4, \XMM8
   1191
   1192        movdqa    SHUF_MASK(%rip), %xmm15
   1193        # multiply TMP5 * HashKey using karatsuba
   1194
   1195	movdqa	  \XMM5, \TMP4
   1196	pshufd	  $78, \XMM5, \TMP6
   1197	pxor	  \XMM5, \TMP6
   1198	paddd     ONE(%rip), \XMM0		# INCR CNT
   1199	movdqu	  HashKey_4(%arg2), \TMP5
   1200	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
   1201	movdqa    \XMM0, \XMM1
   1202	paddd     ONE(%rip), \XMM0		# INCR CNT
   1203	movdqa    \XMM0, \XMM2
   1204	paddd     ONE(%rip), \XMM0		# INCR CNT
   1205	movdqa    \XMM0, \XMM3
   1206	paddd     ONE(%rip), \XMM0		# INCR CNT
   1207	movdqa    \XMM0, \XMM4
   1208	pshufb %xmm15, \XMM1	# perform a 16 byte swap
   1209	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
   1210	pshufb %xmm15, \XMM2	# perform a 16 byte swap
   1211	pshufb %xmm15, \XMM3	# perform a 16 byte swap
   1212	pshufb %xmm15, \XMM4	# perform a 16 byte swap
   1213
   1214	pxor	  (%arg1), \XMM1
   1215	pxor	  (%arg1), \XMM2
   1216	pxor	  (%arg1), \XMM3
   1217	pxor	  (%arg1), \XMM4
   1218	movdqu	  HashKey_4_k(%arg2), \TMP5
   1219	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
   1220	movaps 0x10(%arg1), \TMP1
   1221	aesenc	  \TMP1, \XMM1              # Round 1
   1222	aesenc	  \TMP1, \XMM2
   1223	aesenc	  \TMP1, \XMM3
   1224	aesenc	  \TMP1, \XMM4
   1225	movaps 0x20(%arg1), \TMP1
   1226	aesenc	  \TMP1, \XMM1              # Round 2
   1227	aesenc	  \TMP1, \XMM2
   1228	aesenc	  \TMP1, \XMM3
   1229	aesenc	  \TMP1, \XMM4
   1230	movdqa	  \XMM6, \TMP1
   1231	pshufd	  $78, \XMM6, \TMP2
   1232	pxor	  \XMM6, \TMP2
   1233	movdqu	  HashKey_3(%arg2), \TMP5
   1234	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
   1235	movaps 0x30(%arg1), \TMP3
   1236	aesenc    \TMP3, \XMM1              # Round 3
   1237	aesenc    \TMP3, \XMM2
   1238	aesenc    \TMP3, \XMM3
   1239	aesenc    \TMP3, \XMM4
   1240	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
   1241	movaps 0x40(%arg1), \TMP3
   1242	aesenc	  \TMP3, \XMM1              # Round 4
   1243	aesenc	  \TMP3, \XMM2
   1244	aesenc	  \TMP3, \XMM3
   1245	aesenc	  \TMP3, \XMM4
   1246	movdqu	  HashKey_3_k(%arg2), \TMP5
   1247	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
   1248	movaps 0x50(%arg1), \TMP3
   1249	aesenc	  \TMP3, \XMM1              # Round 5
   1250	aesenc	  \TMP3, \XMM2
   1251	aesenc	  \TMP3, \XMM3
   1252	aesenc	  \TMP3, \XMM4
   1253	pxor	  \TMP1, \TMP4
   1254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
   1255	pxor	  \XMM6, \XMM5
   1256	pxor	  \TMP2, \TMP6
   1257	movdqa	  \XMM7, \TMP1
   1258	pshufd	  $78, \XMM7, \TMP2
   1259	pxor	  \XMM7, \TMP2
   1260	movdqu	  HashKey_2(%arg2), \TMP5
   1261
   1262        # Multiply TMP5 * HashKey using karatsuba
   1263
   1264	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
   1265	movaps 0x60(%arg1), \TMP3
   1266	aesenc	  \TMP3, \XMM1              # Round 6
   1267	aesenc	  \TMP3, \XMM2
   1268	aesenc	  \TMP3, \XMM3
   1269	aesenc	  \TMP3, \XMM4
   1270	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
   1271	movaps 0x70(%arg1), \TMP3
   1272	aesenc	  \TMP3, \XMM1              # Round 7
   1273	aesenc	  \TMP3, \XMM2
   1274	aesenc	  \TMP3, \XMM3
   1275	aesenc	  \TMP3, \XMM4
   1276	movdqu	  HashKey_2_k(%arg2), \TMP5
   1277	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
   1278	movaps 0x80(%arg1), \TMP3
   1279	aesenc	  \TMP3, \XMM1              # Round 8
   1280	aesenc	  \TMP3, \XMM2
   1281	aesenc	  \TMP3, \XMM3
   1282	aesenc	  \TMP3, \XMM4
   1283	pxor	  \TMP1, \TMP4
   1284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
   1285	pxor	  \XMM7, \XMM5
   1286	pxor	  \TMP2, \TMP6
   1287
   1288        # Multiply XMM8 * HashKey
   1289        # XMM8 and TMP5 hold the values for the two operands
   1290
   1291	movdqa	  \XMM8, \TMP1
   1292	pshufd	  $78, \XMM8, \TMP2
   1293	pxor	  \XMM8, \TMP2
   1294	movdqu	  HashKey(%arg2), \TMP5
   1295	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
   1296	movaps 0x90(%arg1), \TMP3
   1297	aesenc	  \TMP3, \XMM1             # Round 9
   1298	aesenc	  \TMP3, \XMM2
   1299	aesenc	  \TMP3, \XMM3
   1300	aesenc	  \TMP3, \XMM4
   1301	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
   1302	lea	  0xa0(%arg1),%r10
   1303	mov	  keysize,%eax
   1304	shr	  $2,%eax		        # 128->4, 192->6, 256->8
   1305	sub	  $4,%eax			# 128->0, 192->2, 256->4
   1306	jz	  aes_loop_par_dec_done\@
   1307
   1308aes_loop_par_dec\@:
   1309	MOVADQ	  (%r10),\TMP3
   1310.irpc	index, 1234
   1311	aesenc	  \TMP3, %xmm\index
   1312.endr
   1313	add	  $16,%r10
   1314	sub	  $1,%eax
   1315	jnz	  aes_loop_par_dec\@
   1316
   1317aes_loop_par_dec_done\@:
   1318	MOVADQ	  (%r10), \TMP3
   1319	aesenclast \TMP3, \XMM1           # last round
   1320	aesenclast \TMP3, \XMM2
   1321	aesenclast \TMP3, \XMM3
   1322	aesenclast \TMP3, \XMM4
   1323	movdqu    HashKey_k(%arg2), \TMP5
   1324	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
   1325	movdqu	  (%arg4,%r11,1), \TMP3
   1326	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
   1327	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
   1328	movdqa    \TMP3, \XMM1
   1329	movdqu	  16(%arg4,%r11,1), \TMP3
   1330	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
   1331	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
   1332	movdqa    \TMP3, \XMM2
   1333	movdqu	  32(%arg4,%r11,1), \TMP3
   1334	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
   1335	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
   1336	movdqa    \TMP3, \XMM3
   1337	movdqu	  48(%arg4,%r11,1), \TMP3
   1338	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
   1339	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
   1340	movdqa    \TMP3, \XMM4
   1341	pshufb %xmm15, \XMM1        # perform a 16 byte swap
   1342	pshufb %xmm15, \XMM2	# perform a 16 byte swap
   1343	pshufb %xmm15, \XMM3	# perform a 16 byte swap
   1344	pshufb %xmm15, \XMM4	# perform a 16 byte swap
   1345
   1346	pxor	  \TMP4, \TMP1
   1347	pxor	  \XMM8, \XMM5
   1348	pxor	  \TMP6, \TMP2
   1349	pxor	  \TMP1, \TMP2
   1350	pxor	  \XMM5, \TMP2
   1351	movdqa	  \TMP2, \TMP3
   1352	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
   1353	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
   1354	pxor	  \TMP3, \XMM5
   1355	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
   1356
   1357        # first phase of reduction
   1358
   1359	movdqa    \XMM5, \TMP2
   1360	movdqa    \XMM5, \TMP3
   1361	movdqa    \XMM5, \TMP4
   1362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
   1363	pslld     $31, \TMP2                   # packed right shift << 31
   1364	pslld     $30, \TMP3                   # packed right shift << 30
   1365	pslld     $25, \TMP4                   # packed right shift << 25
   1366	pxor      \TMP3, \TMP2	               # xor the shifted versions
   1367	pxor      \TMP4, \TMP2
   1368	movdqa    \TMP2, \TMP5
   1369	psrldq    $4, \TMP5                    # right shift T5 1 DW
   1370	pslldq    $12, \TMP2                   # left shift T2 3 DWs
   1371	pxor      \TMP2, \XMM5
   1372
   1373        # second phase of reduction
   1374
   1375	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
   1376	movdqa    \XMM5,\TMP3
   1377	movdqa    \XMM5,\TMP4
   1378	psrld     $1, \TMP2                    # packed left shift >>1
   1379	psrld     $2, \TMP3                    # packed left shift >>2
   1380	psrld     $7, \TMP4                    # packed left shift >>7
   1381	pxor      \TMP3,\TMP2		       # xor the shifted versions
   1382	pxor      \TMP4,\TMP2
   1383	pxor      \TMP5, \TMP2
   1384	pxor      \TMP2, \XMM5
    1385	pxor      \TMP1, \XMM5                 # result is in XMM5
   1386
   1387	pxor	  \XMM5, \XMM1
   1388.endm
   1389
   1390/* GHASH the last 4 ciphertext blocks. */
   1391.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
   1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
   1393
   1394        # Multiply TMP6 * HashKey (using Karatsuba)
   1395
   1396	movdqa	  \XMM1, \TMP6
   1397	pshufd	  $78, \XMM1, \TMP2
   1398	pxor	  \XMM1, \TMP2
   1399	movdqu	  HashKey_4(%arg2), \TMP5
   1400	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
   1401	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
   1402	movdqu	  HashKey_4_k(%arg2), \TMP4
   1403	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
   1404	movdqa	  \XMM1, \XMMDst
   1405	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
   1406
   1407        # Multiply TMP1 * HashKey (using Karatsuba)
   1408
   1409	movdqa	  \XMM2, \TMP1
   1410	pshufd	  $78, \XMM2, \TMP2
   1411	pxor	  \XMM2, \TMP2
   1412	movdqu	  HashKey_3(%arg2), \TMP5
   1413	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
   1414	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
   1415	movdqu	  HashKey_3_k(%arg2), \TMP4
   1416	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
   1417	pxor	  \TMP1, \TMP6
   1418	pxor	  \XMM2, \XMMDst
   1419	pxor	  \TMP2, \XMM1
   1420# results accumulated in TMP6, XMMDst, XMM1
   1421
   1422        # Multiply TMP1 * HashKey (using Karatsuba)
   1423
   1424	movdqa	  \XMM3, \TMP1
   1425	pshufd	  $78, \XMM3, \TMP2
   1426	pxor	  \XMM3, \TMP2
   1427	movdqu	  HashKey_2(%arg2), \TMP5
   1428	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
   1429	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
   1430	movdqu	  HashKey_2_k(%arg2), \TMP4
   1431	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
   1432	pxor	  \TMP1, \TMP6
   1433	pxor	  \XMM3, \XMMDst
   1434	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
   1435
   1436        # Multiply TMP1 * HashKey (using Karatsuba)
   1437	movdqa	  \XMM4, \TMP1
   1438	pshufd	  $78, \XMM4, \TMP2
   1439	pxor	  \XMM4, \TMP2
   1440	movdqu	  HashKey(%arg2), \TMP5
   1441	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
   1442	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
   1443	movdqu	  HashKey_k(%arg2), \TMP4
   1444	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
   1445	pxor	  \TMP1, \TMP6
   1446	pxor	  \XMM4, \XMMDst
   1447	pxor	  \XMM1, \TMP2
   1448	pxor	  \TMP6, \TMP2
   1449	pxor	  \XMMDst, \TMP2
   1450	# middle section of the temp results combined as in karatsuba algorithm
   1451	movdqa	  \TMP2, \TMP4
   1452	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
   1453	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
   1454	pxor	  \TMP4, \XMMDst
   1455	pxor	  \TMP2, \TMP6
   1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
   1457	# first phase of the reduction
   1458	movdqa    \XMMDst, \TMP2
   1459	movdqa    \XMMDst, \TMP3
   1460	movdqa    \XMMDst, \TMP4
   1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
   1462	pslld     $31, \TMP2                # packed right shifting << 31
   1463	pslld     $30, \TMP3                # packed right shifting << 30
   1464	pslld     $25, \TMP4                # packed right shifting << 25
   1465	pxor      \TMP3, \TMP2              # xor the shifted versions
   1466	pxor      \TMP4, \TMP2
   1467	movdqa    \TMP2, \TMP7
   1468	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
   1469	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
   1470	pxor      \TMP2, \XMMDst
   1471
   1472        # second phase of the reduction
   1473	movdqa    \XMMDst, \TMP2
   1474	# make 3 copies of XMMDst for doing 3 shift operations
   1475	movdqa    \XMMDst, \TMP3
   1476	movdqa    \XMMDst, \TMP4
   1477	psrld     $1, \TMP2                 # packed left shift >> 1
   1478	psrld     $2, \TMP3                 # packed left shift >> 2
   1479	psrld     $7, \TMP4                 # packed left shift >> 7
   1480	pxor      \TMP3, \TMP2              # xor the shifted versions
   1481	pxor      \TMP4, \TMP2
   1482	pxor      \TMP7, \TMP2
   1483	pxor      \TMP2, \XMMDst
   1484	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
   1485.endm
   1486
   1487
   1488/* Encryption of a single block
   1489* uses eax & r10
   1490*/
   1491
   1492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
   1493
   1494	pxor		(%arg1), \XMM0
   1495	mov		keysize,%eax
   1496	shr		$2,%eax			# 128->4, 192->6, 256->8
   1497	add		$5,%eax			# 128->9, 192->11, 256->13
   1498	lea		16(%arg1), %r10	  # get first expanded key address
   1499
   1500_esb_loop_\@:
   1501	MOVADQ		(%r10),\TMP1
   1502	aesenc		\TMP1,\XMM0
   1503	add		$16,%r10
   1504	sub		$1,%eax
   1505	jnz		_esb_loop_\@
   1506
   1507	MOVADQ		(%r10),\TMP1
   1508	aesenclast	\TMP1,\XMM0
   1509.endm
   1510/*****************************************************************************
   1511* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
   1512*                   struct gcm_context_data *data
   1513*                                      // Context data
   1514*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
   1515*                   const u8 *in,      // Ciphertext input
   1516*                   u64 plaintext_len, // Length of data in bytes for decryption.
   1517*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
   1518*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
   1519*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
   1520*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
   1521*                   const u8 *aad,     // Additional Authentication Data (AAD)
   1522*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
   1523*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
   1524*                                      // given authentication tag and only return the plaintext if they match.
   1525*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
   1526*                                      // (most likely), 12 or 8.
   1527*
   1528* Assumptions:
   1529*
   1530* keys:
   1531*       keys are pre-expanded and aligned to 16 bytes. we are using the first
   1532*       set of 11 keys in the data structure void *aes_ctx
   1533*
   1534* iv:
   1535*       0                   1                   2                   3
   1536*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   1537*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1538*       |                             Salt  (From the SA)               |
   1539*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1540*       |                     Initialization Vector                     |
   1541*       |         (This is the sequence number from IPSec header)       |
   1542*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1543*       |                              0x1                              |
   1544*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1545*
   1546*
   1547*
   1548* AAD:
   1549*       AAD padded to 128 bits with 0
   1550*       for example, assume AAD is a u32 vector
   1551*
   1552*       if AAD is 8 bytes:
   1553*       AAD[3] = {A0, A1};
   1554*       padded AAD in xmm register = {A1 A0 0 0}
   1555*
   1556*       0                   1                   2                   3
   1557*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   1558*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1559*       |                               SPI (A1)                        |
   1560*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1561*       |                     32-bit Sequence Number (A0)               |
   1562*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1563*       |                              0x0                              |
   1564*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1565*
   1566*                                       AAD Format with 32-bit Sequence Number
   1567*
   1568*       if AAD is 12 bytes:
   1569*       AAD[3] = {A0, A1, A2};
   1570*       padded AAD in xmm register = {A2 A1 A0 0}
   1571*
   1572*       0                   1                   2                   3
   1573*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   1574*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1577*       |                               SPI (A2)                        |
   1578*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1579*       |                 64-bit Extended Sequence Number {A1,A0}       |
   1580*       |                                                               |
   1581*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1582*       |                              0x0                              |
   1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1584*
   1585*                        AAD Format with 64-bit Extended Sequence Number
   1586*
   1587* poly = x^128 + x^127 + x^126 + x^121 + 1
   1588*
   1589*****************************************************************************/
   1590SYM_FUNC_START(aesni_gcm_dec)
   1591	FUNC_SAVE
   1592
   1593	GCM_INIT %arg6, arg7, arg8, arg9
   1594	GCM_ENC_DEC dec
   1595	GCM_COMPLETE arg10, arg11
   1596	FUNC_RESTORE
   1597	RET
   1598SYM_FUNC_END(aesni_gcm_dec)
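
/*
 * Pre-counter block sketch (illustrative only): for RFC4106 the 16-byte j0
 * passed in as iv is assembled by the caller from the layout documented above,
 * roughly:
 *
 *	unsigned char j0[16];
 *
 *	memcpy(j0, salt, 4);		// 4-byte salt from the Security Association
 *	memcpy(j0 + 4, esp_iv, 8);	// 8-byte IV from the ESP payload
 *	j0[12] = 0;
 *	j0[13] = 0;
 *	j0[14] = 0;
 *	j0[15] = 1;			// 32-bit counter part starts at 0x00000001
 *
 * The names salt and esp_iv are placeholders; the real assembly of j0 happens
 * in the glue code before this routine is called.
 */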
   1599
   1600
   1601/*****************************************************************************
   1602* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
   1603*                    struct gcm_context_data *data
   1604*                                        // Context data
   1605*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
   1606*                    const u8 *in,       // Plaintext input
   1607*                    u64 plaintext_len,  // Length of data in bytes for encryption.
   1608*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
   1609*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
   1610*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
   1611*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
   1612*                    const u8 *aad,      // Additional Authentication Data (AAD)
   1613*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
   1614*                    u8 *auth_tag,       // Authenticated Tag output.
   1615*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
   1616*                                        // 12 or 8.
   1617*
   1618* Assumptions:
   1619*
   1620* keys:
   1621*       keys are pre-expanded and aligned to 16 bytes. we are using the
   1622*       first set of 11 keys in the data structure void *aes_ctx
   1623*
   1624*
   1625* iv:
   1626*       0                   1                   2                   3
   1627*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   1628*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1629*       |                             Salt  (From the SA)               |
   1630*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1631*       |                     Initialization Vector                     |
   1632*       |         (This is the sequence number from IPSec header)       |
   1633*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1634*       |                              0x1                              |
   1635*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1636*
   1637*
   1638*
   1639* AAD:
   1640*       AAD padded to 128 bits with 0
   1641*       for example, assume AAD is a u32 vector
   1642*
   1643*       if AAD is 8 bytes:
   1644*       AAD[3] = {A0, A1};
   1645*       padded AAD in xmm register = {A1 A0 0 0}
   1646*
   1647*       0                   1                   2                   3
   1648*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   1649*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1650*       |                               SPI (A1)                        |
   1651*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1652*       |                     32-bit Sequence Number (A0)               |
   1653*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1654*       |                              0x0                              |
   1655*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1656*
   1657*                                 AAD Format with 32-bit Sequence Number
   1658*
   1659*       if AAD is 12 bytes:
   1660*       AAD[3] = {A0, A1, A2};
   1661*       padded AAD in xmm register = {A2 A1 A0 0}
   1662*
   1663*       0                   1                   2                   3
   1664*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   1665*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1666*       |                               SPI (A2)                        |
   1667*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1668*       |                 64-bit Extended Sequence Number {A1,A0}       |
   1669*       |                                                               |
   1670*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1671*       |                              0x0                              |
   1672*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   1673*
   1674*                         AAD Format with 64-bit Extended Sequence Number
   1675*
   1676* poly = x^128 + x^127 + x^126 + x^121 + 1
   1677***************************************************************************/
   1678SYM_FUNC_START(aesni_gcm_enc)
   1679	FUNC_SAVE
   1680
   1681	GCM_INIT %arg6, arg7, arg8, arg9
   1682	GCM_ENC_DEC enc
   1683
   1684	GCM_COMPLETE arg10, arg11
   1685	FUNC_RESTORE
   1686	RET
   1687SYM_FUNC_END(aesni_gcm_enc)
   1688
   1689/*****************************************************************************
   1690* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
   1691*                     struct gcm_context_data *data,
   1692*                                         // context data
   1693*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
   1694*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
   1695*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
   1696*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
   1697*                     const u8 *aad,      // Additional Authentication Data (AAD)
   1698*                     u64 aad_len)        // Length of AAD in bytes.
   1699*/
   1700SYM_FUNC_START(aesni_gcm_init)
   1701	FUNC_SAVE
    1702	GCM_INIT %arg3, %arg4, %arg5, %arg6
   1703	FUNC_RESTORE
   1704	RET
   1705SYM_FUNC_END(aesni_gcm_init)
   1706
   1707/*****************************************************************************
   1708* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
   1709*                    struct gcm_context_data *data,
   1710*                                        // context data
   1711*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
   1712*                    const u8 *in,       // Plaintext input
   1713*                    u64 plaintext_len,  // Length of data in bytes for encryption.
   1714*/
   1715SYM_FUNC_START(aesni_gcm_enc_update)
   1716	FUNC_SAVE
   1717	GCM_ENC_DEC enc
   1718	FUNC_RESTORE
   1719	RET
   1720SYM_FUNC_END(aesni_gcm_enc_update)
   1721
   1722/*****************************************************************************
   1723* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
   1724*                    struct gcm_context_data *data,
   1725*                                        // context data
   1726*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
   1727*                    const u8 *in,       // Plaintext input
   1728*                    u64 plaintext_len,  // Length of data in bytes for encryption.
   1729*/
   1730SYM_FUNC_START(aesni_gcm_dec_update)
   1731	FUNC_SAVE
   1732	GCM_ENC_DEC dec
   1733	FUNC_RESTORE
   1734	RET
   1735SYM_FUNC_END(aesni_gcm_dec_update)
   1736
   1737/*****************************************************************************
   1738* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
   1739*                    struct gcm_context_data *data,
   1740*                                        // context data
   1741*                    u8 *auth_tag,       // Authenticated Tag output.
   1742*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
   1743*                                        // 12 or 8.
   1744*/
   1745SYM_FUNC_START(aesni_gcm_finalize)
   1746	FUNC_SAVE
   1747	GCM_COMPLETE %arg3 %arg4
   1748	FUNC_RESTORE
   1749	RET
   1750SYM_FUNC_END(aesni_gcm_finalize)
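
/*
 * Calling-order sketch (illustrative only): the init/update/finalize entry
 * points above are meant to be driven as a stream by the glue code, roughly:
 *
 *	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *	aesni_gcm_enc_update(aes_ctx, &data, out, in, len);	// may be repeated;
 *								// aesni_gcm_dec_update for decryption
 *	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
 *
 * with struct gcm_context_data data carrying per-request state between calls.
 * Details such as kernel_fpu_begin()/kernel_fpu_end() and scatterlist walking
 * live in the glue code and are omitted here.
 */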
   1751
   1752#endif
   1753
   1754SYM_FUNC_START_LOCAL(_key_expansion_256a)
   1755	pshufd $0b11111111, %xmm1, %xmm1
   1756	shufps $0b00010000, %xmm0, %xmm4
   1757	pxor %xmm4, %xmm0
   1758	shufps $0b10001100, %xmm0, %xmm4
   1759	pxor %xmm4, %xmm0
   1760	pxor %xmm1, %xmm0
   1761	movaps %xmm0, (TKEYP)
   1762	add $0x10, TKEYP
   1763	RET
   1764SYM_FUNC_END(_key_expansion_256a)
   1765SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
   1766
   1767SYM_FUNC_START_LOCAL(_key_expansion_192a)
   1768	pshufd $0b01010101, %xmm1, %xmm1
   1769	shufps $0b00010000, %xmm0, %xmm4
   1770	pxor %xmm4, %xmm0
   1771	shufps $0b10001100, %xmm0, %xmm4
   1772	pxor %xmm4, %xmm0
   1773	pxor %xmm1, %xmm0
   1774
   1775	movaps %xmm2, %xmm5
   1776	movaps %xmm2, %xmm6
   1777	pslldq $4, %xmm5
   1778	pshufd $0b11111111, %xmm0, %xmm3
   1779	pxor %xmm3, %xmm2
   1780	pxor %xmm5, %xmm2
   1781
   1782	movaps %xmm0, %xmm1
   1783	shufps $0b01000100, %xmm0, %xmm6
   1784	movaps %xmm6, (TKEYP)
   1785	shufps $0b01001110, %xmm2, %xmm1
   1786	movaps %xmm1, 0x10(TKEYP)
   1787	add $0x20, TKEYP
   1788	RET
   1789SYM_FUNC_END(_key_expansion_192a)
   1790
   1791SYM_FUNC_START_LOCAL(_key_expansion_192b)
   1792	pshufd $0b01010101, %xmm1, %xmm1
   1793	shufps $0b00010000, %xmm0, %xmm4
   1794	pxor %xmm4, %xmm0
   1795	shufps $0b10001100, %xmm0, %xmm4
   1796	pxor %xmm4, %xmm0
   1797	pxor %xmm1, %xmm0
   1798
   1799	movaps %xmm2, %xmm5
   1800	pslldq $4, %xmm5
   1801	pshufd $0b11111111, %xmm0, %xmm3
   1802	pxor %xmm3, %xmm2
   1803	pxor %xmm5, %xmm2
   1804
   1805	movaps %xmm0, (TKEYP)
   1806	add $0x10, TKEYP
   1807	RET
   1808SYM_FUNC_END(_key_expansion_192b)
   1809
   1810SYM_FUNC_START_LOCAL(_key_expansion_256b)
   1811	pshufd $0b10101010, %xmm1, %xmm1
   1812	shufps $0b00010000, %xmm2, %xmm4
   1813	pxor %xmm4, %xmm2
   1814	shufps $0b10001100, %xmm2, %xmm4
   1815	pxor %xmm4, %xmm2
   1816	pxor %xmm1, %xmm2
   1817	movaps %xmm2, (TKEYP)
   1818	add $0x10, TKEYP
   1819	RET
   1820SYM_FUNC_END(_key_expansion_256b)
   1821
   1822/*
   1823 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
   1824 *                   unsigned int key_len)
   1825 */
   1826SYM_FUNC_START(aesni_set_key)
   1827	FRAME_BEGIN
   1828#ifndef __x86_64__
   1829	pushl KEYP
   1830	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
   1831	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
   1832	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
   1833#endif
   1834	movups (UKEYP), %xmm0		# user key (first 16 bytes)
   1835	movaps %xmm0, (KEYP)
   1836	lea 0x10(KEYP), TKEYP		# key addr
   1837	movl %edx, 480(KEYP)
   1838	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
   1839	cmp $24, %dl
   1840	jb .Lenc_key128
   1841	je .Lenc_key192
   1842	movups 0x10(UKEYP), %xmm2	# other user key
   1843	movaps %xmm2, (TKEYP)
   1844	add $0x10, TKEYP
   1845	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
   1846	call _key_expansion_256a
   1847	aeskeygenassist $0x1, %xmm0, %xmm1
   1848	call _key_expansion_256b
   1849	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
   1850	call _key_expansion_256a
   1851	aeskeygenassist $0x2, %xmm0, %xmm1
   1852	call _key_expansion_256b
   1853	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
   1854	call _key_expansion_256a
   1855	aeskeygenassist $0x4, %xmm0, %xmm1
   1856	call _key_expansion_256b
   1857	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
   1858	call _key_expansion_256a
   1859	aeskeygenassist $0x8, %xmm0, %xmm1
   1860	call _key_expansion_256b
   1861	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
   1862	call _key_expansion_256a
   1863	aeskeygenassist $0x10, %xmm0, %xmm1
   1864	call _key_expansion_256b
   1865	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
   1866	call _key_expansion_256a
   1867	aeskeygenassist $0x20, %xmm0, %xmm1
   1868	call _key_expansion_256b
   1869	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
   1870	call _key_expansion_256a
   1871	jmp .Ldec_key
   1872.Lenc_key192:
   1873	movq 0x10(UKEYP), %xmm2		# other user key
   1874	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
   1875	call _key_expansion_192a
   1876	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
   1877	call _key_expansion_192b
   1878	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
   1879	call _key_expansion_192a
   1880	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
   1881	call _key_expansion_192b
   1882	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
   1883	call _key_expansion_192a
   1884	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
   1885	call _key_expansion_192b
   1886	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
   1887	call _key_expansion_192a
   1888	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
   1889	call _key_expansion_192b
   1890	jmp .Ldec_key
   1891.Lenc_key128:
   1892	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
   1893	call _key_expansion_128
   1894	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
   1895	call _key_expansion_128
   1896	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
   1897	call _key_expansion_128
   1898	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
   1899	call _key_expansion_128
   1900	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
   1901	call _key_expansion_128
   1902	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
   1903	call _key_expansion_128
   1904	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
   1905	call _key_expansion_128
   1906	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
   1907	call _key_expansion_128
   1908	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
   1909	call _key_expansion_128
   1910	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
   1911	call _key_expansion_128
   1912.Ldec_key:
   1913	sub $0x10, TKEYP
   1914	movaps (KEYP), %xmm0
   1915	movaps (TKEYP), %xmm1
   1916	movaps %xmm0, 240(TKEYP)
   1917	movaps %xmm1, 240(KEYP)
   1918	add $0x10, KEYP
   1919	lea 240-16(TKEYP), UKEYP
   1920.align 4
   1921.Ldec_key_loop:
   1922	movaps (KEYP), %xmm0
   1923	aesimc %xmm0, %xmm1
   1924	movaps %xmm1, (UKEYP)
   1925	add $0x10, KEYP
   1926	sub $0x10, UKEYP
   1927	cmp TKEYP, KEYP
   1928	jb .Ldec_key_loop
   1929	xor AREG, AREG
   1930#ifndef __x86_64__
   1931	popl KEYP
   1932#endif
   1933	FRAME_END
   1934	RET
   1935SYM_FUNC_END(aesni_set_key)
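
/*
 * Key-expansion sketch (illustrative only, with intrinsics rather than the
 * hand-written helpers above): _key_expansion_128 combines the aeskeygenassist
 * result with the previous round key in a way equivalent to the textbook
 * helper below, and the .Ldec_key loop derives the decryption schedule by
 * applying the inverse MixColumns to the encryption round keys in reverse
 * order:
 *
 *	#include <immintrin.h>
 *
 *	static __m128i expand_128_step(__m128i key, __m128i assist)
 *	{
 *		assist = _mm_shuffle_epi32(assist, 0xff);	// broadcast the rotated/substituted word
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		return _mm_xor_si128(key, assist);
 *	}
 *
 *	// decryption schedule (Equivalent Inverse Cipher), nrounds = 10/12/14:
 *	// dec[0] = enc[nrounds];
 *	// dec[i] = _mm_aesimc_si128(enc[nrounds - i]) for 0 < i < nrounds;
 *	// dec[nrounds] = enc[0];
 */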
   1936
   1937/*
   1938 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
   1939 */
   1940SYM_FUNC_START(aesni_enc)
   1941	FRAME_BEGIN
   1942#ifndef __x86_64__
   1943	pushl KEYP
   1944	pushl KLEN
   1945	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
   1946	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
   1947	movl (FRAME_OFFSET+20)(%esp), INP	# src
   1948#endif
   1949	movl 480(KEYP), KLEN		# key length
   1950	movups (INP), STATE		# input
   1951	call _aesni_enc1
   1952	movups STATE, (OUTP)		# output
   1953#ifndef __x86_64__
   1954	popl KLEN
   1955	popl KEYP
   1956#endif
   1957	FRAME_END
   1958	RET
   1959SYM_FUNC_END(aesni_enc)
   1960
   1961/*
   1962 * _aesni_enc1:		internal ABI
   1963 * input:
   1964 *	KEYP:		key struct pointer
    1965 *	KLEN:		key length
   1966 *	STATE:		initial state (input)
   1967 * output:
    1968 *	STATE:		final state (output)
   1969 * changed:
   1970 *	KEY
   1971 *	TKEYP (T1)
   1972 */
   1973SYM_FUNC_START_LOCAL(_aesni_enc1)
   1974	movaps (KEYP), KEY		# key
   1975	mov KEYP, TKEYP
   1976	pxor KEY, STATE		# round 0
   1977	add $0x30, TKEYP
   1978	cmp $24, KLEN
   1979	jb .Lenc128
   1980	lea 0x20(TKEYP), TKEYP
   1981	je .Lenc192
   1982	add $0x20, TKEYP
   1983	movaps -0x60(TKEYP), KEY
   1984	aesenc KEY, STATE
   1985	movaps -0x50(TKEYP), KEY
   1986	aesenc KEY, STATE
   1987.align 4
   1988.Lenc192:
   1989	movaps -0x40(TKEYP), KEY
   1990	aesenc KEY, STATE
   1991	movaps -0x30(TKEYP), KEY
   1992	aesenc KEY, STATE
   1993.align 4
   1994.Lenc128:
   1995	movaps -0x20(TKEYP), KEY
   1996	aesenc KEY, STATE
   1997	movaps -0x10(TKEYP), KEY
   1998	aesenc KEY, STATE
   1999	movaps (TKEYP), KEY
   2000	aesenc KEY, STATE
   2001	movaps 0x10(TKEYP), KEY
   2002	aesenc KEY, STATE
   2003	movaps 0x20(TKEYP), KEY
   2004	aesenc KEY, STATE
   2005	movaps 0x30(TKEYP), KEY
   2006	aesenc KEY, STATE
   2007	movaps 0x40(TKEYP), KEY
   2008	aesenc KEY, STATE
   2009	movaps 0x50(TKEYP), KEY
   2010	aesenc KEY, STATE
   2011	movaps 0x60(TKEYP), KEY
   2012	aesenc KEY, STATE
   2013	movaps 0x70(TKEYP), KEY
   2014	aesenclast KEY, STATE
   2015	RET
   2016SYM_FUNC_END(_aesni_enc1)
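
/*
 * Equivalent intrinsics sketch (illustrative only): _aesni_enc1 performs the
 * usual AddRoundKey / aesenc / aesenclast sequence, using KLEN to decide how
 * many leading aesenc rounds to run.  In C this is roughly:
 *
 *	#include <immintrin.h>
 *
 *	static __m128i aes_enc_block(__m128i block, const __m128i *rk, int nrounds)
 *	{
 *		int i;
 *
 *		block = _mm_xor_si128(block, rk[0]);		// round 0
 *		for (i = 1; i < nrounds; i++)
 *			block = _mm_aesenc_si128(block, rk[i]);
 *		return _mm_aesenclast_si128(block, rk[nrounds]);
 *	}
 *
 * where nrounds is 10, 12 or 14.  The assembly unrolls the loop and skips the
 * first two or four aesenc steps for the shorter key sizes.
 */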
   2017
   2018/*
   2019 * _aesni_enc4:	internal ABI
   2020 * input:
   2021 *	KEYP:		key struct pointer
    2022 *	KLEN:		key length
   2023 *	STATE1:		initial state (input)
   2024 *	STATE2
   2025 *	STATE3
   2026 *	STATE4
   2027 * output:
    2028 *	STATE1:		final state (output)
   2029 *	STATE2
   2030 *	STATE3
   2031 *	STATE4
   2032 * changed:
   2033 *	KEY
   2034 *	TKEYP (T1)
   2035 */
   2036SYM_FUNC_START_LOCAL(_aesni_enc4)
   2037	movaps (KEYP), KEY		# key
   2038	mov KEYP, TKEYP
   2039	pxor KEY, STATE1		# round 0
   2040	pxor KEY, STATE2
   2041	pxor KEY, STATE3
   2042	pxor KEY, STATE4
   2043	add $0x30, TKEYP
   2044	cmp $24, KLEN
   2045	jb .L4enc128
   2046	lea 0x20(TKEYP), TKEYP
   2047	je .L4enc192
   2048	add $0x20, TKEYP
   2049	movaps -0x60(TKEYP), KEY
   2050	aesenc KEY, STATE1
   2051	aesenc KEY, STATE2
   2052	aesenc KEY, STATE3
   2053	aesenc KEY, STATE4
   2054	movaps -0x50(TKEYP), KEY
   2055	aesenc KEY, STATE1
   2056	aesenc KEY, STATE2
   2057	aesenc KEY, STATE3
   2058	aesenc KEY, STATE4
   2059#.align 4
   2060.L4enc192:
   2061	movaps -0x40(TKEYP), KEY
   2062	aesenc KEY, STATE1
   2063	aesenc KEY, STATE2
   2064	aesenc KEY, STATE3
   2065	aesenc KEY, STATE4
   2066	movaps -0x30(TKEYP), KEY
   2067	aesenc KEY, STATE1
   2068	aesenc KEY, STATE2
   2069	aesenc KEY, STATE3
   2070	aesenc KEY, STATE4
   2071#.align 4
   2072.L4enc128:
   2073	movaps -0x20(TKEYP), KEY
   2074	aesenc KEY, STATE1
   2075	aesenc KEY, STATE2
   2076	aesenc KEY, STATE3
   2077	aesenc KEY, STATE4
   2078	movaps -0x10(TKEYP), KEY
   2079	aesenc KEY, STATE1
   2080	aesenc KEY, STATE2
   2081	aesenc KEY, STATE3
   2082	aesenc KEY, STATE4
   2083	movaps (TKEYP), KEY
   2084	aesenc KEY, STATE1
   2085	aesenc KEY, STATE2
   2086	aesenc KEY, STATE3
   2087	aesenc KEY, STATE4
   2088	movaps 0x10(TKEYP), KEY
   2089	aesenc KEY, STATE1
   2090	aesenc KEY, STATE2
   2091	aesenc KEY, STATE3
   2092	aesenc KEY, STATE4
   2093	movaps 0x20(TKEYP), KEY
   2094	aesenc KEY, STATE1
   2095	aesenc KEY, STATE2
   2096	aesenc KEY, STATE3
   2097	aesenc KEY, STATE4
   2098	movaps 0x30(TKEYP), KEY
   2099	aesenc KEY, STATE1
   2100	aesenc KEY, STATE2
   2101	aesenc KEY, STATE3
   2102	aesenc KEY, STATE4
   2103	movaps 0x40(TKEYP), KEY
   2104	aesenc KEY, STATE1
   2105	aesenc KEY, STATE2
   2106	aesenc KEY, STATE3
   2107	aesenc KEY, STATE4
   2108	movaps 0x50(TKEYP), KEY
   2109	aesenc KEY, STATE1
   2110	aesenc KEY, STATE2
   2111	aesenc KEY, STATE3
   2112	aesenc KEY, STATE4
   2113	movaps 0x60(TKEYP), KEY
   2114	aesenc KEY, STATE1
   2115	aesenc KEY, STATE2
   2116	aesenc KEY, STATE3
   2117	aesenc KEY, STATE4
   2118	movaps 0x70(TKEYP), KEY
   2119	aesenclast KEY, STATE1		# last round
   2120	aesenclast KEY, STATE2
   2121	aesenclast KEY, STATE3
   2122	aesenclast KEY, STATE4
   2123	RET
   2124SYM_FUNC_END(_aesni_enc4)
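
/*
 * Why four states (illustrative note): aesenc has a latency of several cycles
 * on typical implementations but can be issued nearly every cycle, so
 * interleaving four independent blocks keeps the AES unit busy.  The same
 * structure with intrinsics would be roughly:
 *
 *	for (i = 1; i < nrounds; i++) {
 *		b0 = _mm_aesenc_si128(b0, rk[i]);
 *		b1 = _mm_aesenc_si128(b1, rk[i]);
 *		b2 = _mm_aesenc_si128(b2, rk[i]);
 *		b3 = _mm_aesenc_si128(b3, rk[i]);
 *	}
 */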
   2125
   2126/*
    2127 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
   2128 */
   2129SYM_FUNC_START(aesni_dec)
   2130	FRAME_BEGIN
   2131#ifndef __x86_64__
   2132	pushl KEYP
   2133	pushl KLEN
   2134	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
   2135	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
   2136	movl (FRAME_OFFSET+20)(%esp), INP	# src
   2137#endif
   2138	mov 480(KEYP), KLEN		# key length
   2139	add $240, KEYP
   2140	movups (INP), STATE		# input
   2141	call _aesni_dec1
    2142	movups STATE, (OUTP)		# output
   2143#ifndef __x86_64__
   2144	popl KLEN
   2145	popl KEYP
   2146#endif
   2147	FRAME_END
   2148	RET
   2149SYM_FUNC_END(aesni_dec)
   2150
   2151/*
   2152 * _aesni_dec1:		internal ABI
   2153 * input:
   2154 *	KEYP:		key struct pointer
   2155 *	KLEN:		key length
   2156 *	STATE:		initial state (input)
   2157 * output:
    2158 *	STATE:		final state (output)
   2159 * changed:
   2160 *	KEY
   2161 *	TKEYP (T1)
   2162 */
   2163SYM_FUNC_START_LOCAL(_aesni_dec1)
   2164	movaps (KEYP), KEY		# key
   2165	mov KEYP, TKEYP
   2166	pxor KEY, STATE		# round 0
   2167	add $0x30, TKEYP
   2168	cmp $24, KLEN
   2169	jb .Ldec128
   2170	lea 0x20(TKEYP), TKEYP
   2171	je .Ldec192
   2172	add $0x20, TKEYP
   2173	movaps -0x60(TKEYP), KEY
   2174	aesdec KEY, STATE
   2175	movaps -0x50(TKEYP), KEY
   2176	aesdec KEY, STATE
   2177.align 4
   2178.Ldec192:
   2179	movaps -0x40(TKEYP), KEY
   2180	aesdec KEY, STATE
   2181	movaps -0x30(TKEYP), KEY
   2182	aesdec KEY, STATE
   2183.align 4
   2184.Ldec128:
   2185	movaps -0x20(TKEYP), KEY
   2186	aesdec KEY, STATE
   2187	movaps -0x10(TKEYP), KEY
   2188	aesdec KEY, STATE
   2189	movaps (TKEYP), KEY
   2190	aesdec KEY, STATE
   2191	movaps 0x10(TKEYP), KEY
   2192	aesdec KEY, STATE
   2193	movaps 0x20(TKEYP), KEY
   2194	aesdec KEY, STATE
   2195	movaps 0x30(TKEYP), KEY
   2196	aesdec KEY, STATE
   2197	movaps 0x40(TKEYP), KEY
   2198	aesdec KEY, STATE
   2199	movaps 0x50(TKEYP), KEY
   2200	aesdec KEY, STATE
   2201	movaps 0x60(TKEYP), KEY
   2202	aesdec KEY, STATE
   2203	movaps 0x70(TKEYP), KEY
   2204	aesdeclast KEY, STATE
   2205	RET
   2206SYM_FUNC_END(_aesni_dec1)
   2207
   2208/*
   2209 * _aesni_dec4:	internal ABI
   2210 * input:
   2211 *	KEYP:		key struct pointer
   2212 *	KLEN:		key length
   2213 *	STATE1:		initial state (input)
   2214 *	STATE2
   2215 *	STATE3
   2216 *	STATE4
   2217 * output:
    2218 *	STATE1:		final state (output)
   2219 *	STATE2
   2220 *	STATE3
   2221 *	STATE4
   2222 * changed:
   2223 *	KEY
   2224 *	TKEYP (T1)
   2225 */
   2226SYM_FUNC_START_LOCAL(_aesni_dec4)
   2227	movaps (KEYP), KEY		# key
   2228	mov KEYP, TKEYP
   2229	pxor KEY, STATE1		# round 0
   2230	pxor KEY, STATE2
   2231	pxor KEY, STATE3
   2232	pxor KEY, STATE4
   2233	add $0x30, TKEYP
   2234	cmp $24, KLEN
   2235	jb .L4dec128
   2236	lea 0x20(TKEYP), TKEYP
   2237	je .L4dec192
   2238	add $0x20, TKEYP
   2239	movaps -0x60(TKEYP), KEY
   2240	aesdec KEY, STATE1
   2241	aesdec KEY, STATE2
   2242	aesdec KEY, STATE3
   2243	aesdec KEY, STATE4
   2244	movaps -0x50(TKEYP), KEY
   2245	aesdec KEY, STATE1
   2246	aesdec KEY, STATE2
   2247	aesdec KEY, STATE3
   2248	aesdec KEY, STATE4
   2249.align 4
   2250.L4dec192:
   2251	movaps -0x40(TKEYP), KEY
   2252	aesdec KEY, STATE1
   2253	aesdec KEY, STATE2
   2254	aesdec KEY, STATE3
   2255	aesdec KEY, STATE4
   2256	movaps -0x30(TKEYP), KEY
   2257	aesdec KEY, STATE1
   2258	aesdec KEY, STATE2
   2259	aesdec KEY, STATE3
   2260	aesdec KEY, STATE4
   2261.align 4
   2262.L4dec128:
   2263	movaps -0x20(TKEYP), KEY
   2264	aesdec KEY, STATE1
   2265	aesdec KEY, STATE2
   2266	aesdec KEY, STATE3
   2267	aesdec KEY, STATE4
   2268	movaps -0x10(TKEYP), KEY
   2269	aesdec KEY, STATE1
   2270	aesdec KEY, STATE2
   2271	aesdec KEY, STATE3
   2272	aesdec KEY, STATE4
   2273	movaps (TKEYP), KEY
   2274	aesdec KEY, STATE1
   2275	aesdec KEY, STATE2
   2276	aesdec KEY, STATE3
   2277	aesdec KEY, STATE4
   2278	movaps 0x10(TKEYP), KEY
   2279	aesdec KEY, STATE1
   2280	aesdec KEY, STATE2
   2281	aesdec KEY, STATE3
   2282	aesdec KEY, STATE4
   2283	movaps 0x20(TKEYP), KEY
   2284	aesdec KEY, STATE1
   2285	aesdec KEY, STATE2
   2286	aesdec KEY, STATE3
   2287	aesdec KEY, STATE4
   2288	movaps 0x30(TKEYP), KEY
   2289	aesdec KEY, STATE1
   2290	aesdec KEY, STATE2
   2291	aesdec KEY, STATE3
   2292	aesdec KEY, STATE4
   2293	movaps 0x40(TKEYP), KEY
   2294	aesdec KEY, STATE1
   2295	aesdec KEY, STATE2
   2296	aesdec KEY, STATE3
   2297	aesdec KEY, STATE4
   2298	movaps 0x50(TKEYP), KEY
   2299	aesdec KEY, STATE1
   2300	aesdec KEY, STATE2
   2301	aesdec KEY, STATE3
   2302	aesdec KEY, STATE4
   2303	movaps 0x60(TKEYP), KEY
   2304	aesdec KEY, STATE1
   2305	aesdec KEY, STATE2
   2306	aesdec KEY, STATE3
   2307	aesdec KEY, STATE4
   2308	movaps 0x70(TKEYP), KEY
   2309	aesdeclast KEY, STATE1		# last round
   2310	aesdeclast KEY, STATE2
   2311	aesdeclast KEY, STATE3
   2312	aesdeclast KEY, STATE4
   2313	RET
   2314SYM_FUNC_END(_aesni_dec4)
   2315
   2316/*
    2317 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
   2318 *		      size_t len)
   2319 */
   2320SYM_FUNC_START(aesni_ecb_enc)
   2321	FRAME_BEGIN
   2322#ifndef __x86_64__
   2323	pushl LEN
   2324	pushl KEYP
   2325	pushl KLEN
   2326	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
   2327	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
   2328	movl (FRAME_OFFSET+24)(%esp), INP	# src
   2329	movl (FRAME_OFFSET+28)(%esp), LEN	# len
   2330#endif
   2331	test LEN, LEN		# check length
   2332	jz .Lecb_enc_ret
   2333	mov 480(KEYP), KLEN
   2334	cmp $16, LEN
   2335	jb .Lecb_enc_ret
   2336	cmp $64, LEN
   2337	jb .Lecb_enc_loop1
   2338.align 4
   2339.Lecb_enc_loop4:
   2340	movups (INP), STATE1
   2341	movups 0x10(INP), STATE2
   2342	movups 0x20(INP), STATE3
   2343	movups 0x30(INP), STATE4
   2344	call _aesni_enc4
   2345	movups STATE1, (OUTP)
   2346	movups STATE2, 0x10(OUTP)
   2347	movups STATE3, 0x20(OUTP)
   2348	movups STATE4, 0x30(OUTP)
   2349	sub $64, LEN
   2350	add $64, INP
   2351	add $64, OUTP
   2352	cmp $64, LEN
   2353	jge .Lecb_enc_loop4
   2354	cmp $16, LEN
   2355	jb .Lecb_enc_ret
   2356.align 4
   2357.Lecb_enc_loop1:
   2358	movups (INP), STATE1
   2359	call _aesni_enc1
   2360	movups STATE1, (OUTP)
   2361	sub $16, LEN
   2362	add $16, INP
   2363	add $16, OUTP
   2364	cmp $16, LEN
   2365	jge .Lecb_enc_loop1
   2366.Lecb_enc_ret:
   2367#ifndef __x86_64__
   2368	popl KLEN
   2369	popl KEYP
   2370	popl LEN
   2371#endif
   2372	FRAME_END
   2373	RET
   2374SYM_FUNC_END(aesni_ecb_enc)
   2375
   2376/*
    2377 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
   2378 *		      size_t len);
   2379 */
   2380SYM_FUNC_START(aesni_ecb_dec)
   2381	FRAME_BEGIN
   2382#ifndef __x86_64__
   2383	pushl LEN
   2384	pushl KEYP
   2385	pushl KLEN
   2386	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
   2387	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
   2388	movl (FRAME_OFFSET+24)(%esp), INP	# src
   2389	movl (FRAME_OFFSET+28)(%esp), LEN	# len
   2390#endif
   2391	test LEN, LEN
   2392	jz .Lecb_dec_ret
   2393	mov 480(KEYP), KLEN
   2394	add $240, KEYP
   2395	cmp $16, LEN
   2396	jb .Lecb_dec_ret
   2397	cmp $64, LEN
   2398	jb .Lecb_dec_loop1
   2399.align 4
   2400.Lecb_dec_loop4:
   2401	movups (INP), STATE1
   2402	movups 0x10(INP), STATE2
   2403	movups 0x20(INP), STATE3
   2404	movups 0x30(INP), STATE4
   2405	call _aesni_dec4
   2406	movups STATE1, (OUTP)
   2407	movups STATE2, 0x10(OUTP)
   2408	movups STATE3, 0x20(OUTP)
   2409	movups STATE4, 0x30(OUTP)
   2410	sub $64, LEN
   2411	add $64, INP
   2412	add $64, OUTP
   2413	cmp $64, LEN
   2414	jge .Lecb_dec_loop4
   2415	cmp $16, LEN
   2416	jb .Lecb_dec_ret
   2417.align 4
   2418.Lecb_dec_loop1:
   2419	movups (INP), STATE1
   2420	call _aesni_dec1
   2421	movups STATE1, (OUTP)
   2422	sub $16, LEN
   2423	add $16, INP
   2424	add $16, OUTP
   2425	cmp $16, LEN
   2426	jge .Lecb_dec_loop1
   2427.Lecb_dec_ret:
   2428#ifndef __x86_64__
   2429	popl KLEN
   2430	popl KEYP
   2431	popl LEN
   2432#endif
   2433	FRAME_END
   2434	RET
   2435SYM_FUNC_END(aesni_ecb_dec)
   2436
   2437/*
    2438 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
   2439 *		      size_t len, u8 *iv)
   2440 */
   2441SYM_FUNC_START(aesni_cbc_enc)
   2442	FRAME_BEGIN
   2443#ifndef __x86_64__
   2444	pushl IVP
   2445	pushl LEN
   2446	pushl KEYP
   2447	pushl KLEN
   2448	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
   2449	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
   2450	movl (FRAME_OFFSET+28)(%esp), INP	# src
   2451	movl (FRAME_OFFSET+32)(%esp), LEN	# len
   2452	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
   2453#endif
   2454	cmp $16, LEN
   2455	jb .Lcbc_enc_ret
   2456	mov 480(KEYP), KLEN
   2457	movups (IVP), STATE	# load iv as initial state
   2458.align 4
   2459.Lcbc_enc_loop:
   2460	movups (INP), IN	# load input
   2461	pxor IN, STATE
   2462	call _aesni_enc1
   2463	movups STATE, (OUTP)	# store output
   2464	sub $16, LEN
   2465	add $16, INP
   2466	add $16, OUTP
   2467	cmp $16, LEN
   2468	jge .Lcbc_enc_loop
   2469	movups STATE, (IVP)
   2470.Lcbc_enc_ret:
   2471#ifndef __x86_64__
   2472	popl KLEN
   2473	popl KEYP
   2474	popl LEN
   2475	popl IVP
   2476#endif
   2477	FRAME_END
   2478	RET
   2479SYM_FUNC_END(aesni_cbc_enc)
   2480
   2481/*
    2482 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
   2483 *		      size_t len, u8 *iv)
   2484 */
   2485SYM_FUNC_START(aesni_cbc_dec)
   2486	FRAME_BEGIN
   2487#ifndef __x86_64__
   2488	pushl IVP
   2489	pushl LEN
   2490	pushl KEYP
   2491	pushl KLEN
   2492	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
   2493	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
   2494	movl (FRAME_OFFSET+28)(%esp), INP	# src
   2495	movl (FRAME_OFFSET+32)(%esp), LEN	# len
   2496	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
   2497#endif
   2498	cmp $16, LEN
   2499	jb .Lcbc_dec_just_ret
   2500	mov 480(KEYP), KLEN
   2501	add $240, KEYP
   2502	movups (IVP), IV
   2503	cmp $64, LEN
   2504	jb .Lcbc_dec_loop1
   2505.align 4
   2506.Lcbc_dec_loop4:
   2507	movups (INP), IN1
   2508	movaps IN1, STATE1
   2509	movups 0x10(INP), IN2
   2510	movaps IN2, STATE2
   2511#ifdef __x86_64__
   2512	movups 0x20(INP), IN3
   2513	movaps IN3, STATE3
   2514	movups 0x30(INP), IN4
   2515	movaps IN4, STATE4
   2516#else
   2517	movups 0x20(INP), IN1
   2518	movaps IN1, STATE3
   2519	movups 0x30(INP), IN2
   2520	movaps IN2, STATE4
   2521#endif
   2522	call _aesni_dec4
   2523	pxor IV, STATE1
   2524#ifdef __x86_64__
   2525	pxor IN1, STATE2
   2526	pxor IN2, STATE3
   2527	pxor IN3, STATE4
   2528	movaps IN4, IV
   2529#else
   2530	pxor IN1, STATE4
   2531	movaps IN2, IV
   2532	movups (INP), IN1
   2533	pxor IN1, STATE2
   2534	movups 0x10(INP), IN2
   2535	pxor IN2, STATE3
   2536#endif
   2537	movups STATE1, (OUTP)
   2538	movups STATE2, 0x10(OUTP)
   2539	movups STATE3, 0x20(OUTP)
   2540	movups STATE4, 0x30(OUTP)
   2541	sub $64, LEN
   2542	add $64, INP
   2543	add $64, OUTP
   2544	cmp $64, LEN
   2545	jge .Lcbc_dec_loop4
   2546	cmp $16, LEN
   2547	jb .Lcbc_dec_ret
   2548.align 4
   2549.Lcbc_dec_loop1:
   2550	movups (INP), IN
   2551	movaps IN, STATE
   2552	call _aesni_dec1
   2553	pxor IV, STATE
   2554	movups STATE, (OUTP)
   2555	movaps IN, IV
   2556	sub $16, LEN
   2557	add $16, INP
   2558	add $16, OUTP
   2559	cmp $16, LEN
   2560	jge .Lcbc_dec_loop1
   2561.Lcbc_dec_ret:
   2562	movups IV, (IVP)
   2563.Lcbc_dec_just_ret:
   2564#ifndef __x86_64__
   2565	popl KLEN
   2566	popl KEYP
   2567	popl LEN
   2568	popl IVP
   2569#endif
   2570	FRAME_END
   2571	RET
   2572SYM_FUNC_END(aesni_cbc_dec)
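
/*
 * CBC-decrypt chaining sketch (illustrative only): each plaintext block is the
 * decryption of the corresponding ciphertext block XORed with the previous
 * ciphertext block (the IV for the first block), which is why the loops above
 * keep copies of the ciphertext around across the call to _aesni_dec4.
 * Assuming rk points at the decryption key schedule prepared by aesni_set_key:
 *
 *	#include <immintrin.h>
 *
 *	static void cbc_dec(unsigned char *dst, const unsigned char *src,
 *			    unsigned long nblocks, const __m128i *rk, int nrounds,
 *			    unsigned char iv[16])
 *	{
 *		__m128i prev = _mm_loadu_si128((const __m128i *)iv);
 *		unsigned long i;
 *		int r;
 *
 *		for (i = 0; i < nblocks; i++) {
 *			__m128i c = _mm_loadu_si128((const __m128i *)(src + 16 * i));
 *			__m128i b = _mm_xor_si128(c, rk[0]);
 *
 *			for (r = 1; r < nrounds; r++)
 *				b = _mm_aesdec_si128(b, rk[r]);
 *			b = _mm_aesdeclast_si128(b, rk[nrounds]);
 *
 *			b = _mm_xor_si128(b, prev);	// P[i] = D(C[i]) ^ C[i-1]
 *			_mm_storeu_si128((__m128i *)(dst + 16 * i), b);
 *			prev = c;
 *		}
 *		_mm_storeu_si128((__m128i *)iv, prev);	// updated chaining value
 *	}
 */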
   2573
   2574/*
    2575 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
   2576 *			  size_t len, u8 *iv)
   2577 */
   2578SYM_FUNC_START(aesni_cts_cbc_enc)
   2579	FRAME_BEGIN
   2580#ifndef __x86_64__
   2581	pushl IVP
   2582	pushl LEN
   2583	pushl KEYP
   2584	pushl KLEN
   2585	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
   2586	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
   2587	movl (FRAME_OFFSET+28)(%esp), INP	# src
   2588	movl (FRAME_OFFSET+32)(%esp), LEN	# len
   2589	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
   2590	lea .Lcts_permute_table, T1
   2591#else
   2592	lea .Lcts_permute_table(%rip), T1
   2593#endif
   2594	mov 480(KEYP), KLEN
   2595	movups (IVP), STATE
   2596	sub $16, LEN
   2597	mov T1, IVP
   2598	add $32, IVP
   2599	add LEN, T1
   2600	sub LEN, IVP
   2601	movups (T1), %xmm4
   2602	movups (IVP), %xmm5
   2603
   2604	movups (INP), IN1
   2605	add LEN, INP
   2606	movups (INP), IN2
   2607
   2608	pxor IN1, STATE
   2609	call _aesni_enc1
   2610
   2611	pshufb %xmm5, IN2
   2612	pxor STATE, IN2
   2613	pshufb %xmm4, STATE
   2614	add OUTP, LEN
   2615	movups STATE, (LEN)
   2616
   2617	movaps IN2, STATE
   2618	call _aesni_enc1
   2619	movups STATE, (OUTP)
   2620
   2621#ifndef __x86_64__
   2622	popl KLEN
   2623	popl KEYP
   2624	popl LEN
   2625	popl IVP
   2626#endif
   2627	FRAME_END
   2628	RET
   2629SYM_FUNC_END(aesni_cts_cbc_enc)
   2630
   2631/*
    2632 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
   2633 *			  size_t len, u8 *iv)
   2634 */
   2635SYM_FUNC_START(aesni_cts_cbc_dec)
   2636	FRAME_BEGIN
   2637#ifndef __x86_64__
   2638	pushl IVP
   2639	pushl LEN
   2640	pushl KEYP
   2641	pushl KLEN
   2642	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
   2643	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
   2644	movl (FRAME_OFFSET+28)(%esp), INP	# src
   2645	movl (FRAME_OFFSET+32)(%esp), LEN	# len
   2646	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
   2647	lea .Lcts_permute_table, T1
   2648#else
   2649	lea .Lcts_permute_table(%rip), T1
   2650#endif
   2651	mov 480(KEYP), KLEN
   2652	add $240, KEYP
   2653	movups (IVP), IV
   2654	sub $16, LEN
   2655	mov T1, IVP
   2656	add $32, IVP
   2657	add LEN, T1
   2658	sub LEN, IVP
   2659	movups (T1), %xmm4
   2660
   2661	movups (INP), STATE
   2662	add LEN, INP
   2663	movups (INP), IN1
   2664
   2665	call _aesni_dec1
   2666	movaps STATE, IN2
   2667	pshufb %xmm4, STATE
   2668	pxor IN1, STATE
   2669
   2670	add OUTP, LEN
   2671	movups STATE, (LEN)
   2672
   2673	movups (IVP), %xmm0
   2674	pshufb %xmm0, IN1
   2675	pblendvb IN2, IN1
   2676	movaps IN1, STATE
   2677	call _aesni_dec1
   2678
   2679	pxor IV, STATE
   2680	movups STATE, (OUTP)
   2681
   2682#ifndef __x86_64__
   2683	popl KLEN
   2684	popl KEYP
   2685	popl LEN
   2686	popl IVP
   2687#endif
   2688	FRAME_END
   2689	RET
   2690SYM_FUNC_END(aesni_cts_cbc_dec)
   2691
   2692.pushsection .rodata
   2693.align 16
   2694.Lcts_permute_table:
   2695	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
   2696	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
   2697	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
   2698	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
   2699	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
   2700	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
   2701#ifdef __x86_64__
   2702.Lbswap_mask:
   2703	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
   2704#endif
   2705.popsection
   2706
   2707#ifdef __x86_64__
   2708/*
   2709 * _aesni_inc_init:	internal ABI
   2710 *	setup registers used by _aesni_inc
   2711 * input:
   2712 *	IV
   2713 * output:
   2714 *	CTR:	== IV, in little endian
   2715 *	TCTR_LOW: == lower qword of CTR
   2716 *	INC:	== 1, in little endian
   2717 *	BSWAP_MASK == endian swapping mask
   2718 */
   2719SYM_FUNC_START_LOCAL(_aesni_inc_init)
   2720	movaps .Lbswap_mask, BSWAP_MASK
   2721	movaps IV, CTR
   2722	pshufb BSWAP_MASK, CTR
   2723	mov $1, TCTR_LOW
   2724	movq TCTR_LOW, INC
   2725	movq CTR, TCTR_LOW
   2726	RET
   2727SYM_FUNC_END(_aesni_inc_init)
   2728
   2729/*
   2730 * _aesni_inc:		internal ABI
   2731 *	Increase IV by 1, IV is in big endian
   2732 * input:
   2733 *	IV
   2734 *	CTR:	== IV, in little endian
   2735 *	TCTR_LOW: == lower qword of CTR
   2736 *	INC:	== 1, in little endian
   2737 *	BSWAP_MASK == endian swapping mask
   2738 * output:
    2739 *	IV:	increased by 1
   2740 * changed:
   2741 *	CTR:	== output IV, in little endian
   2742 *	TCTR_LOW: == lower qword of CTR
   2743 */
   2744SYM_FUNC_START_LOCAL(_aesni_inc)
   2745	paddq INC, CTR
   2746	add $1, TCTR_LOW
   2747	jnc .Linc_low
   2748	pslldq $8, INC
   2749	paddq INC, CTR
   2750	psrldq $8, INC
   2751.Linc_low:
   2752	movaps CTR, IV
   2753	pshufb BSWAP_MASK, IV
   2754	RET
   2755SYM_FUNC_END(_aesni_inc)
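
/*
 * Counter-increment sketch (illustrative only): _aesni_inc keeps the counter
 * byte-swapped so it can use paddq, propagating the carry from the low qword
 * into the high qword by hand.  Operating directly on the big-endian block,
 * the same operation is simply:
 *
 *	static void ctr_inc_be(unsigned char ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i] != 0)	// stop once a byte does not wrap
 *				break;
 *	}
 */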
   2756
   2757/*
    2758 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
   2759 *		      size_t len, u8 *iv)
   2760 */
   2761SYM_FUNC_START(aesni_ctr_enc)
   2762	FRAME_BEGIN
   2763	cmp $16, LEN
   2764	jb .Lctr_enc_just_ret
   2765	mov 480(KEYP), KLEN
   2766	movups (IVP), IV
   2767	call _aesni_inc_init
   2768	cmp $64, LEN
   2769	jb .Lctr_enc_loop1
   2770.align 4
   2771.Lctr_enc_loop4:
   2772	movaps IV, STATE1
   2773	call _aesni_inc
   2774	movups (INP), IN1
   2775	movaps IV, STATE2
   2776	call _aesni_inc
   2777	movups 0x10(INP), IN2
   2778	movaps IV, STATE3
   2779	call _aesni_inc
   2780	movups 0x20(INP), IN3
   2781	movaps IV, STATE4
   2782	call _aesni_inc
   2783	movups 0x30(INP), IN4
   2784	call _aesni_enc4
   2785	pxor IN1, STATE1
   2786	movups STATE1, (OUTP)
   2787	pxor IN2, STATE2
   2788	movups STATE2, 0x10(OUTP)
   2789	pxor IN3, STATE3
   2790	movups STATE3, 0x20(OUTP)
   2791	pxor IN4, STATE4
   2792	movups STATE4, 0x30(OUTP)
   2793	sub $64, LEN
   2794	add $64, INP
   2795	add $64, OUTP
   2796	cmp $64, LEN
   2797	jge .Lctr_enc_loop4
   2798	cmp $16, LEN
   2799	jb .Lctr_enc_ret
   2800.align 4
   2801.Lctr_enc_loop1:
   2802	movaps IV, STATE
   2803	call _aesni_inc
   2804	movups (INP), IN
   2805	call _aesni_enc1
   2806	pxor IN, STATE
   2807	movups STATE, (OUTP)
   2808	sub $16, LEN
   2809	add $16, INP
   2810	add $16, OUTP
   2811	cmp $16, LEN
   2812	jge .Lctr_enc_loop1
   2813.Lctr_enc_ret:
   2814	movups IV, (IVP)
   2815.Lctr_enc_just_ret:
   2816	FRAME_END
   2817	RET
   2818SYM_FUNC_END(aesni_ctr_enc)
   2819
   2820#endif
   2821
   2822.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
   2823.align 16
   2824.Lgf128mul_x_ble_mask:
   2825	.octa 0x00000000000000010000000000000087
   2826.previous
   2827
   2828/*
   2829 * _aesni_gf128mul_x_ble:		internal ABI
   2830 *	Multiply in GF(2^128) for XTS IVs
   2831 * input:
   2832 *	IV:	current IV
   2833 *	GF128MUL_MASK == mask with 0x87 and 0x01
   2834 * output:
   2835 *	IV:	next IV
   2836 * changed:
   2837 *	CTR:	== temporary value
   2838 */
   2839#define _aesni_gf128mul_x_ble() \
   2840	pshufd $0x13, IV, KEY; \
   2841	paddq IV, IV; \
   2842	psrad $31, KEY; \
   2843	pand GF128MUL_MASK, KEY; \
   2844	pxor KEY, IV;
   2845
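/*
 * Tweak-update sketch (illustrative only): _aesni_gf128mul_x_ble multiplies
 * the 128-bit XTS tweak by x in GF(2^128).  Viewing the tweak as a 128-bit
 * little-endian value, that is a one-bit left shift with a conditional XOR of
 * 0x87 into the lowest byte:
 *
 *	static void xts_mul_x(unsigned char t[16])
 *	{
 *		unsigned int carry = 0, next;
 *		int i;
 *
 *		for (i = 0; i < 16; i++) {
 *			next = t[i] >> 7;
 *			t[i] = (unsigned char)((t[i] << 1) | carry);
 *			carry = next;
 *		}
 *		if (carry)
 *			t[0] ^= 0x87;	// reduce by x^128 + x^7 + x^2 + x + 1
 *	}
 *
 * The macro doubles each 64-bit half with paddq, while the pshufd/psrad/pand
 * sequence turns bit 63 and bit 127 of the old tweak into the cross-qword
 * carry bit and the 0x87 reduction byte, which the final pxor merges in.
 */
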
   2846/*
   2847 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
   2848 *			  const u8 *src, unsigned int len, le128 *iv)
   2849 */
   2850SYM_FUNC_START(aesni_xts_encrypt)
   2851	FRAME_BEGIN
   2852#ifndef __x86_64__
   2853	pushl IVP
   2854	pushl LEN
   2855	pushl KEYP
   2856	pushl KLEN
   2857	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
   2858	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
   2859	movl (FRAME_OFFSET+28)(%esp), INP	# src
   2860	movl (FRAME_OFFSET+32)(%esp), LEN	# len
   2861	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
   2862	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
   2863#else
   2864	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
   2865#endif
   2866	movups (IVP), IV
   2867
   2868	mov 480(KEYP), KLEN
   2869
   2870.Lxts_enc_loop4:
   2871	sub $64, LEN
   2872	jl .Lxts_enc_1x
   2873
   2874	movdqa IV, STATE1
   2875	movdqu 0x00(INP), IN
   2876	pxor IN, STATE1
   2877	movdqu IV, 0x00(OUTP)
   2878
   2879	_aesni_gf128mul_x_ble()
   2880	movdqa IV, STATE2
   2881	movdqu 0x10(INP), IN
   2882	pxor IN, STATE2
   2883	movdqu IV, 0x10(OUTP)
   2884
   2885	_aesni_gf128mul_x_ble()
   2886	movdqa IV, STATE3
   2887	movdqu 0x20(INP), IN
   2888	pxor IN, STATE3
   2889	movdqu IV, 0x20(OUTP)
   2890
   2891	_aesni_gf128mul_x_ble()
   2892	movdqa IV, STATE4
   2893	movdqu 0x30(INP), IN
   2894	pxor IN, STATE4
   2895	movdqu IV, 0x30(OUTP)
   2896
   2897	call _aesni_enc4
   2898
   2899	movdqu 0x00(OUTP), IN
   2900	pxor IN, STATE1
   2901	movdqu STATE1, 0x00(OUTP)
   2902
   2903	movdqu 0x10(OUTP), IN
   2904	pxor IN, STATE2
   2905	movdqu STATE2, 0x10(OUTP)
   2906
   2907	movdqu 0x20(OUTP), IN
   2908	pxor IN, STATE3
   2909	movdqu STATE3, 0x20(OUTP)
   2910
   2911	movdqu 0x30(OUTP), IN
   2912	pxor IN, STATE4
   2913	movdqu STATE4, 0x30(OUTP)
   2914
   2915	_aesni_gf128mul_x_ble()
   2916
   2917	add $64, INP
   2918	add $64, OUTP
   2919	test LEN, LEN
   2920	jnz .Lxts_enc_loop4
   2921
   2922.Lxts_enc_ret_iv:
   2923	movups IV, (IVP)
   2924
   2925.Lxts_enc_ret:
   2926#ifndef __x86_64__
   2927	popl KLEN
   2928	popl KEYP
   2929	popl LEN
   2930	popl IVP
   2931#endif
   2932	FRAME_END
   2933	RET
   2934
   2935.Lxts_enc_1x:
   2936	add $64, LEN
   2937	jz .Lxts_enc_ret_iv
   2938	sub $16, LEN
   2939	jl .Lxts_enc_cts4
   2940
   2941.Lxts_enc_loop1:
   2942	movdqu (INP), STATE
   2943	pxor IV, STATE
   2944	call _aesni_enc1
   2945	pxor IV, STATE
   2946	_aesni_gf128mul_x_ble()
   2947
   2948	test LEN, LEN
   2949	jz .Lxts_enc_out
   2950
   2951	add $16, INP
   2952	sub $16, LEN
   2953	jl .Lxts_enc_cts1
   2954
   2955	movdqu STATE, (OUTP)
   2956	add $16, OUTP
   2957	jmp .Lxts_enc_loop1
   2958
   2959.Lxts_enc_out:
   2960	movdqu STATE, (OUTP)
   2961	jmp .Lxts_enc_ret_iv
   2962
   2963.Lxts_enc_cts4:
   2964	movdqa STATE4, STATE
   2965	sub $16, OUTP
   2966
   2967.Lxts_enc_cts1:
   2968#ifndef __x86_64__
   2969	lea .Lcts_permute_table, T1
   2970#else
   2971	lea .Lcts_permute_table(%rip), T1
   2972#endif
   2973	add LEN, INP		/* rewind input pointer */
   2974	add $16, LEN		/* # bytes in final block */
   2975	movups (INP), IN1
   2976
   2977	mov T1, IVP
   2978	add $32, IVP
   2979	add LEN, T1
   2980	sub LEN, IVP
   2981	add OUTP, LEN
   2982
   2983	movups (T1), %xmm4
   2984	movaps STATE, IN2
   2985	pshufb %xmm4, STATE
   2986	movups STATE, (LEN)
   2987
   2988	movups (IVP), %xmm0
   2989	pshufb %xmm0, IN1
   2990	pblendvb IN2, IN1
   2991	movaps IN1, STATE
   2992
   2993	pxor IV, STATE
   2994	call _aesni_enc1
   2995	pxor IV, STATE
   2996
   2997	movups STATE, (OUTP)
   2998	jmp .Lxts_enc_ret
   2999SYM_FUNC_END(aesni_xts_encrypt)
   3000
   3001/*
   3002 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
   3003 *			  const u8 *src, unsigned int len, le128 *iv)
   3004 */
   3005SYM_FUNC_START(aesni_xts_decrypt)
   3006	FRAME_BEGIN
   3007#ifndef __x86_64__
   3008	pushl IVP
   3009	pushl LEN
   3010	pushl KEYP
   3011	pushl KLEN
   3012	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
   3013	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
   3014	movl (FRAME_OFFSET+28)(%esp), INP	# src
   3015	movl (FRAME_OFFSET+32)(%esp), LEN	# len
   3016	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
   3017	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
   3018#else
   3019	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
   3020#endif
   3021	movups (IVP), IV
   3022
   3023	mov 480(KEYP), KLEN
   3024	add $240, KEYP
   3025
   3026	test $15, LEN
   3027	jz .Lxts_dec_loop4
   3028	sub $16, LEN
   3029
   3030.Lxts_dec_loop4:
   3031	sub $64, LEN
   3032	jl .Lxts_dec_1x
   3033
   3034	movdqa IV, STATE1
   3035	movdqu 0x00(INP), IN
   3036	pxor IN, STATE1
   3037	movdqu IV, 0x00(OUTP)
   3038
   3039	_aesni_gf128mul_x_ble()
   3040	movdqa IV, STATE2
   3041	movdqu 0x10(INP), IN
   3042	pxor IN, STATE2
   3043	movdqu IV, 0x10(OUTP)
   3044
   3045	_aesni_gf128mul_x_ble()
   3046	movdqa IV, STATE3
   3047	movdqu 0x20(INP), IN
   3048	pxor IN, STATE3
   3049	movdqu IV, 0x20(OUTP)
   3050
   3051	_aesni_gf128mul_x_ble()
   3052	movdqa IV, STATE4
   3053	movdqu 0x30(INP), IN
   3054	pxor IN, STATE4
   3055	movdqu IV, 0x30(OUTP)
   3056
   3057	call _aesni_dec4
   3058
   3059	movdqu 0x00(OUTP), IN
   3060	pxor IN, STATE1
   3061	movdqu STATE1, 0x00(OUTP)
   3062
   3063	movdqu 0x10(OUTP), IN
   3064	pxor IN, STATE2
   3065	movdqu STATE2, 0x10(OUTP)
   3066
   3067	movdqu 0x20(OUTP), IN
   3068	pxor IN, STATE3
   3069	movdqu STATE3, 0x20(OUTP)
   3070
   3071	movdqu 0x30(OUTP), IN
   3072	pxor IN, STATE4
   3073	movdqu STATE4, 0x30(OUTP)
   3074
   3075	_aesni_gf128mul_x_ble()
   3076
   3077	add $64, INP
   3078	add $64, OUTP
   3079	test LEN, LEN
   3080	jnz .Lxts_dec_loop4
   3081
   3082.Lxts_dec_ret_iv:
   3083	movups IV, (IVP)
   3084
   3085.Lxts_dec_ret:
   3086#ifndef __x86_64__
   3087	popl KLEN
   3088	popl KEYP
   3089	popl LEN
   3090	popl IVP
   3091#endif
   3092	FRAME_END
   3093	RET
   3094
   3095.Lxts_dec_1x:
   3096	add $64, LEN
   3097	jz .Lxts_dec_ret_iv
   3098
   3099.Lxts_dec_loop1:
   3100	movdqu (INP), STATE
   3101
   3102	add $16, INP
   3103	sub $16, LEN
   3104	jl .Lxts_dec_cts1
   3105
   3106	pxor IV, STATE
   3107	call _aesni_dec1
   3108	pxor IV, STATE
   3109	_aesni_gf128mul_x_ble()
   3110
   3111	test LEN, LEN
   3112	jz .Lxts_dec_out
   3113
   3114	movdqu STATE, (OUTP)
   3115	add $16, OUTP
   3116	jmp .Lxts_dec_loop1
   3117
   3118.Lxts_dec_out:
   3119	movdqu STATE, (OUTP)
   3120	jmp .Lxts_dec_ret_iv
   3121
   3122.Lxts_dec_cts1:
   3123	movdqa IV, STATE4
   3124	_aesni_gf128mul_x_ble()
   3125
   3126	pxor IV, STATE
   3127	call _aesni_dec1
   3128	pxor IV, STATE
   3129
   3130#ifndef __x86_64__
   3131	lea .Lcts_permute_table, T1
   3132#else
   3133	lea .Lcts_permute_table(%rip), T1
   3134#endif
   3135	add LEN, INP		/* rewind input pointer */
   3136	add $16, LEN		/* # bytes in final block */
   3137	movups (INP), IN1
   3138
   3139	mov T1, IVP
   3140	add $32, IVP
   3141	add LEN, T1
   3142	sub LEN, IVP
   3143	add OUTP, LEN
   3144
   3145	movups (T1), %xmm4
   3146	movaps STATE, IN2
   3147	pshufb %xmm4, STATE
   3148	movups STATE, (LEN)
   3149
   3150	movups (IVP), %xmm0
   3151	pshufb %xmm0, IN1
   3152	pblendvb IN2, IN1
   3153	movaps IN1, STATE
   3154
   3155	pxor STATE4, STATE
   3156	call _aesni_dec1
   3157	pxor STATE4, STATE
   3158
   3159	movups STATE, (OUTP)
   3160	jmp .Lxts_dec_ret
   3161SYM_FUNC_END(aesni_xts_decrypt)