cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sm4-aesni-avx-asm_64.S (18072B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized,
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define rRIP         (%rip)

#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7

#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11

#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

#define RNOT         %xmm0
#define RBSWAP       %xmm1


/* Transpose a 4x4 block of 32-bit words held across four 128-bit vectors. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2;                \
	vpunpckldq x1, x0, x0;                \
	                                      \
	vpunpckldq x3, x2, t1;                \
	vpunpckhdq x3, x2, x2;                \
	                                      \
	vpunpckhqdq t1, x0, x1;               \
	vpunpcklqdq t1, x0, x0;               \
	                                      \
	vpunpckhqdq x2, t2, x3;               \
	vpunpcklqdq x2, t2, x2;
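
/*
 * Illustration: if x0..x3 initially hold rows {a0,a1,a2,a3} .. {d0,d1,d2,d3},
 * then afterwards x0..x3 hold the columns {a0,b0,c0,d0} .. {a3,b3,c3,d3}
 * (t1 and t2 are clobbered).
 */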

/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0;                     \
	vpandn x, mask4bit, x;                       \
	vpsrld $4, x, x;                             \
	                                             \
	vpshufb tmp0, lo_t, tmp0;                    \
	vpshufb x, hi_t, x;                          \
	vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by the
 * 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandn mask4bit, x, tmp0;                     \
	vpsrld $4, x, x;                              \
	vpand x, mask4bit, x;                         \
	                                              \
	vpshufb tmp0, lo_t, tmp0;                     \
	vpshufb x, hi_t, x;                           \
	vpxor tmp0, x, x;
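
/*
 * Both transforms apply an affine map over GF(2) one nibble at a time:
 * the low and high 4-bit halves of each byte index two 16-entry vpshufb
 * tables (lo_t/hi_t) and the two lookups are XORed together.  This is
 * what lets the AES SubBytes core (AESENCLAST) stand in for the SM4
 * S-box, using the .Lpre_tf_* and .Lpost_tf_* constants defined below.
 */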


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * The following four affine transform look-up tables are from the work of
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * They allow the SM4 S-box to be computed using the AES SubBytes operation.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* Inverse ShiftRows permutation, for isolating SubBytes from AESENCLAST */
    104.Linv_shift_row:
    105	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
    106	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
    107
    108/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
    109.Linv_shift_row_rol_8:
    110	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
    111	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
    112
    113/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
    114.Linv_shift_row_rol_16:
    115	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
    116	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
    117
    118/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
    119.Linv_shift_row_rol_24:
    120	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
    121	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
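
/*
 * Note on the four tables above: AESENCLAST applies ShiftRows before
 * SubBytes and finally XORs in its round-key operand (MASK_4BIT here).
 * Shuffling the result with .Linv_shift_row undoes that ShiftRows; the
 * *_rol_* variants additionally rotate each 32-bit word left by 8/16/24
 * bits for SM4's linear transform.  The stray XOR with MASK_4BIT is
 * undone by transform_post.
 */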

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef


.text
.align 16

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt4)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..4 blocks)
	 *	%rdx: src (1..4 blocks)
	 *	%rcx: num blocks (1..4)
	 */
	FRAME_BEGIN

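	/* Load the first block and duplicate it into the remaining lanes
	 * so that all four lanes hold defined data even when fewer than
	 * four blocks are supplied; lanes beyond num blocks are computed
	 * but never stored back.
	 */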
	vmovdqu 0*16(%rdx), RA0;
	vmovdqa RA0, RA1;
	vmovdqa RA0, RA2;
	vmovdqa RA0, RA3;
	cmpq $2, %rcx;
	jb .Lblk4_load_input_done;
	vmovdqu 1*16(%rdx), RA1;
	je .Lblk4_load_input_done;
	vmovdqu 2*16(%rdx), RA2;
	cmpq $3, %rcx;
	je .Lblk4_load_input_done;
	vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

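	/* SM4 processes 32-bit words in big-endian order; byte-swap each
	 * input word into the register representation used below.
	 */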
	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
	vmovdqa .Lpre_tf_hi_s rRIP, RB0;
	vmovdqa .Lpost_tf_lo_s rRIP, RB1;
	vmovdqa .Lpost_tf_hi_s rRIP, RB2;
	vmovdqa .Linv_shift_row rRIP, RB3;
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
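	/* After the transpose, RA0..RA3 each hold word i (i = 0..3) of all
	 * four blocks, so one vector instruction processes the same word
	 * position of every block.
	 */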

#define ROUND(round, s0, s1, s2, s3)                                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	                                                            \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);           \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);            \
	                                                            \
	/* linear part */                                           \
	vpshufb RB3, RX0, RTMP0;                                    \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP2, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP3, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;            \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0;
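
/*
 * For reference, one SM4 round as computed by ROUND above, written as
 * C-style pseudocode (rol32() is a 32-bit left-rotate, sbox() is the SM4
 * S-box applied per byte):
 *
 *	x   = s1 ^ s2 ^ s3 ^ rk[round];
 *	x   = sbox(x);
 *	s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *
 * rol32(x,2) ^ rol32(x,10) ^ rol32(x,18) is computed here as
 * rol32(x ^ rol32(x,8) ^ rol32(x,16), 2), with the final rotate by 2
 * done via vpslld/vpsrld.
 */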
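	/* %rax marks the end of the 32-word round key array; each loop
	 * iteration below consumes four round keys, so the loop performs
	 * all 32 SM4 rounds in eight iterations.
	 */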
	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
	ROUND(0, RA0, RA1, RA2, RA3);
	ROUND(1, RA1, RA2, RA3, RA0);
	ROUND(2, RA2, RA3, RA0, RA1);
	ROUND(3, RA3, RA0, RA1, RA2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk4;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vmovdqu RA0, 0*16(%rsi);
	cmpq $2, %rcx;
	jb .Lblk4_store_output_done;
	vmovdqu RA1, 1*16(%rsi);
	je .Lblk4_store_output_done;
	vmovdqu RA2, 2*16(%rsi);
	cmpq $3, %rcx;
	je .Lblk4_store_output_done;
	vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)

.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                          \
	vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                          \
	vmovdqa RX0, RX1;                                           \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                         \
	vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                         \
	vpxor r1, RX1, RX1;                                         \
	vpxor r2, RX1, RX1;                                         \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */                 \
                                                                    \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	vmovdqa .Linv_shift_row rRIP, RTMP4;                        \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	vaesenclast MASK_4BIT, RX1, RX1;                            \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
                                                                    \
	/* linear part */                                           \
	vpshufb RTMP4, RX0, RTMP0;                                  \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP4, RX1, RTMP2;                                  \
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;                  \
	vpxor RTMP2, r0, r0; /* r0 ^ x */                           \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	vpxor RTMP1, s0, s0;                                        \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */               \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3;                                    \
	vpsrld $30, RTMP2, RTMP2;                                   \
	vpxor RTMP2, r0, r0;                                        \
	vpxor RTMP3, r0, r0;
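
/*
 * The eight-block ROUND interleaves two independent four-block groups
 * (RX0 for the A group, RX1 for the B group) to hide instruction latency.
 * Unlike the four-block path above, RB0..RB3 now carry data, so the
 * look-up constants are reloaded from memory in every round.
 */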

	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk8)

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..8 blocks)
	 *	%rdx: src (1..8 blocks)
	 *	%rcx: num blocks (1..8)
	 */
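	/* With four or fewer blocks, tail-call the four-block variant;
	 * the arguments are already in the right registers.
	 */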
	cmpq $5, %rcx;
	jb sm4_aesni_avx_crypt4;

	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqa RB0, RB1;
	vmovdqa RB0, RB2;
	vmovdqa RB0, RB3;
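	/* Flags are still those of the cmpq $5 above (the vector moves do
	 * not touch them): equal means exactly five blocks.
	 */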
	je .Lblk8_load_input_done;
	vmovdqu (5 * 16)(%rdx), RB1;
	cmpq $7, %rcx;
	jb .Lblk8_load_input_done;
	vmovdqu (6 * 16)(%rdx), RB2;
	je .Lblk8_load_input_done;
	vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
	call __sm4_crypt_blk8;

	cmpq $6, %rcx;
	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	jb .Lblk8_store_output_done;
	vmovdqu RB1, (5 * 16)(%rsi);
	je .Lblk8_store_output_done;
	vmovdqu RB2, (6 * 16)(%rsi);
	cmpq $7, %rcx;
	je .Lblk8_store_output_done;
	vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)

/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* load IV and byteswap */
	vmovdqu (%rcx), RA0;

	vmovdqa .Lbswap128_mask rRIP, RBSWAP;
	vpshufb RBSWAP, RA0, RTMP0; /* be => le */

	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp;  \
	vpsubq minus_one, x, x;      \
	vpslldq $8, tmp, tmp;        \
	vpsubq tmp, x, x;
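
/*
 * 128-bit increment of the little-endian value in x: minus_one is
 * {-1, 0}, so the vpsubq adds 1 to the low qword; the vpcmpeqq/vpslldq/
 * vpsubq sequence then adds 1 to the high qword iff the low qword was
 * all-ones, i.e. it propagates the carry.
 */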

	/* construct IVs */
	inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
	vpshufb RBSWAP, RTMP0, RA1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
	vpshufb RBSWAP, RTMP0, RA2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
	vpshufb RBSWAP, RTMP0, RA3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
	vpshufb RBSWAP, RTMP0, RB0;
	inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
	vpshufb RBSWAP, RTMP0, RB1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
	vpshufb RBSWAP, RTMP0, RB2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
	vpshufb RBSWAP, RTMP0, RB3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
	vpshufb RBSWAP, RTMP0, RTMP1;

	/* store new IV */
	vmovdqu RTMP1, (%rcx);

	call __sm4_crypt_blk8;

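	/* The encrypted counter blocks are the keystream; XOR them with
	 * the source to produce the output.
	 */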
	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqu (5 * 16)(%rdx), RB1;
	vmovdqu (6 * 16)(%rdx), RB2;
	vmovdqu (7 * 16)(%rdx), RB3;

	call __sm4_crypt_blk8;

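	/* CBC decryption: P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV.  The
	 * last ciphertext block (kept in RNOT) becomes the new IV.
	 */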
	vmovdqu (7 * 16)(%rdx), RNOT;
	vpxor (%rcx), RA0, RA0;
	vpxor (0 * 16)(%rdx), RA1, RA1;
	vpxor (1 * 16)(%rdx), RA2, RA2;
	vpxor (2 * 16)(%rdx), RA3, RA3;
	vpxor (3 * 16)(%rdx), RB0, RB0;
	vpxor (4 * 16)(%rdx), RB1, RB1;
	vpxor (5 * 16)(%rdx), RB2, RB2;
	vpxor (6 * 16)(%rdx), RB3, RB3;
	vmovdqu RNOT, (%rcx); /* store new IV */

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)

/*
 * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	/* Load input */
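	/* CFB decryption: P[i] = E(C[i-1]) ^ C[i] with C[-1] = IV, so the
	 * cipher inputs are the IV followed by the first seven ciphertext
	 * blocks.
	 */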
	vmovdqu (%rcx), RA0;
	vmovdqu 0 * 16(%rdx), RA1;
	vmovdqu 1 * 16(%rdx), RA2;
	vmovdqu 2 * 16(%rdx), RA3;
	vmovdqu 3 * 16(%rdx), RB0;
	vmovdqu 4 * 16(%rdx), RB1;
	vmovdqu 5 * 16(%rdx), RB2;
	vmovdqu 6 * 16(%rdx), RB3;

	/* Update IV */
	vmovdqu 7 * 16(%rdx), RNOT;
	vmovdqu RNOT, (%rcx);

	call __sm4_crypt_blk8;

	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)