sm4-aesni-avx-asm_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized,
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define rRIP         (%rip)

#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7

#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11

#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

#define RNOT         %xmm0
#define RBSWAP       %xmm1


/* Transpose four 32-bit words between 128-bit vectors. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)   \
        vpunpckhdq x1, x0, t2;                  \
        vpunpckldq x1, x0, x0;                  \
                                                \
        vpunpckldq x3, x2, t1;                  \
        vpunpckhdq x3, x2, x2;                  \
                                                \
        vpunpckhqdq t1, x0, x1;                 \
        vpunpcklqdq t1, x0, x0;                 \
                                                \
        vpunpckhqdq x2, t2, x3;                 \
        vpunpcklqdq x2, t2, x2;

/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0)    \
        vpand x, mask4bit, tmp0;                        \
        vpandn x, mask4bit, x;                          \
        vpsrld $4, x, x;                                \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * the 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0)   \
        vpandn mask4bit, x, tmp0;                       \
        vpsrld $4, x, x;                                \
        vpand x, mask4bit, x;                           \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxor tmp0, x, x;

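/*
 * The S-box layer is not implemented with a 256-byte table.  Each input
 * byte is mapped into the AES field with an affine transform
 * (transform_pre + .Lpre_tf_*), run through the AES S-box by borrowing
 * SubBytes from vaesenclast, and mapped back to the SM4 field
 * (transform_post + .Lpost_tf_*).  The affine transforms themselves are
 * evaluated with two 4-bit vpshufb table lookups.  A rough per-byte model
 * in C (a sketch only: aes_sbox[] and the 16-entry nibble tables named
 * pre_tf_lo/pre_tf_hi/post_tf_lo/post_tf_hi are placeholders for the
 * tables defined below, and the extra 0x0f XOR added by vaesenclast's
 * round-key operand, plus the ShiftRows step undone via .Linv_shift_row,
 * are ignored here):
 *
 *	// Affine map A(x) = M*x ^ c, split into two nibble lookups the
 *	// way vpshufb evaluates it; the constant c is folded into one
 *	// of the two tables.
 *	static uint8_t affine8(const uint8_t lo[16], const uint8_t hi[16],
 *			       uint8_t x)
 *	{
 *		return lo[x & 0x0f] ^ hi[x >> 4];
 *	}
 *
 *	static uint8_t sm4_sbox_via_aes(uint8_t x)
 *	{
 *		x = affine8(pre_tf_lo, pre_tf_hi, x);	// transform_pre
 *		x = aes_sbox[x];			// SubBytes via vaesenclast
 *		return affine8(post_tf_lo, post_tf_hi, x); // transform_post
 *	}
 */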

.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * The following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing the SM4 S-Box from the AES SubBytes.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
        .quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
        .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
        .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
        .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
        .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
        .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
        .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
        .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
        .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
        .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
        .long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
        .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef


.text
.align 16

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt4)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (1..4 blocks)
         *      %rdx: src (1..4 blocks)
         *      %rcx: num blocks (1..4)
         */
        FRAME_BEGIN

        vmovdqu 0*16(%rdx), RA0;
        vmovdqa RA0, RA1;
        vmovdqa RA0, RA2;
        vmovdqa RA0, RA3;
        cmpq $2, %rcx;
        jb .Lblk4_load_input_done;
        vmovdqu 1*16(%rdx), RA1;
        je .Lblk4_load_input_done;
        vmovdqu 2*16(%rdx), RA2;
        cmpq $3, %rcx;
        je .Lblk4_load_input_done;
        vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

        vmovdqa .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;

        vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
        vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
        vmovdqa .Lpre_tf_hi_s rRIP, RB0;
        vmovdqa .Lpost_tf_lo_s rRIP, RB1;
        vmovdqa .Lpost_tf_hi_s rRIP, RB2;
        vmovdqa .Linv_shift_row rRIP, RB3;
        vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
        vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);

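/*
 * Each ROUND() below is one SM4 round applied to four blocks at once:
 * after the transpose, s0..s3 each hold one 32-bit state word,
 * word-sliced across the blocks.  A scalar sketch of the same round
 * (rol32() and the sm4_sbox[] table are assumed helpers, not defined in
 * this file):
 *
 *	static uint32_t rol32(uint32_t v, int n)
 *	{
 *		return (v << n) | (v >> (32 - n));
 *	}
 *
 *	static uint32_t sm4_round(uint32_t s0, uint32_t s1, uint32_t s2,
 *				  uint32_t s3, uint32_t rk)
 *	{
 *		uint32_t x = s1 ^ s2 ^ s3 ^ rk;
 *		uint32_t t = 0;
 *		int i;
 *
 *		for (i = 0; i < 4; i++)		// bytewise S-box (tau)
 *			t |= (uint32_t)sm4_sbox[(x >> (8 * i)) & 0xff] << (8 * i);
 *
 *		// linear diffusion L(t), folded into s0
 *		return s0 ^ t ^ rol32(t, 2) ^ rol32(t, 10) ^
 *		       rol32(t, 18) ^ rol32(t, 24);
 *	}
 */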
#define ROUND(round, s0, s1, s2, s3)                            \
        vbroadcastss (4*(round))(%rdi), RX0;                    \
        vpxor s1, RX0, RX0;                                     \
        vpxor s2, RX0, RX0;                                     \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */             \
                                                                \
        /* sbox, non-linear part */                             \
        transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);       \
        vaesenclast MASK_4BIT, RX0, RX0;                        \
        transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);        \
                                                                \
        /* linear part */                                       \
        vpshufb RB3, RX0, RTMP0;                                \
        vpxor RTMP0, s0, s0; /* s0 ^ x */                       \
        vpshufb RTMP2, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */           \
        vpshufb RTMP3, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;        \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */           \
        vpslld $2, RTMP0, RTMP1;                                \
        vpsrld $30, RTMP0, RTMP0;                               \
        vpxor RTMP0, s0, s0;                                    \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpxor RTMP1, s0, s0;

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
        ROUND(0, RA0, RA1, RA2, RA3);
        ROUND(1, RA1, RA2, RA3, RA0);
        ROUND(2, RA2, RA3, RA0, RA1);
        ROUND(3, RA3, RA0, RA1, RA2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk4;

#undef ROUND

        vmovdqa .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;

        vmovdqu RA0, 0*16(%rsi);
        cmpq $2, %rcx;
        jb .Lblk4_store_output_done;
        vmovdqu RA1, 1*16(%rsi);
        je .Lblk4_store_output_done;
        vmovdqu RA2, 2*16(%rsi);
        cmpq $3, %rcx;
        je .Lblk4_store_output_done;
        vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)

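/*
 * __sm4_crypt_blk8 keeps the state word-sliced: after the byte swap and
 * the transpose_4x4() calls, RAi holds word i of blocks 0..3 and RBi
 * holds word i of blocks 4..7, so one broadcast round key and one vector
 * S-box evaluation per group advance four blocks at a time.  The two
 * groups (RX0 for RA*, RX1 for RB*) are interleaved inside ROUND() to
 * hide the latency of vaesenclast and the shuffles.
 */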
.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
         *                                              plaintext blocks
         * output:
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
         *                                              ciphertext blocks
         */
        FRAME_BEGIN

        vmovdqa .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)            \
        vbroadcastss (4*(round))(%rdi), RX0;                    \
        vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                      \
        vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                      \
        vmovdqa RX0, RX1;                                       \
        vpxor s1, RX0, RX0;                                     \
        vpxor s2, RX0, RX0;                                     \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */             \
        vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                     \
        vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                     \
        vpxor r1, RX1, RX1;                                     \
        vpxor r2, RX1, RX1;                                     \
        vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */             \
                                                                \
        /* sbox, non-linear part */                             \
        transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);     \
        transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);     \
        vmovdqa .Linv_shift_row rRIP, RTMP4;                    \
        vaesenclast MASK_4BIT, RX0, RX0;                        \
        vaesenclast MASK_4BIT, RX1, RX1;                        \
        transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);    \
        transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);    \
                                                                \
        /* linear part */                                       \
        vpshufb RTMP4, RX0, RTMP0;                              \
        vpxor RTMP0, s0, s0; /* s0 ^ x */                       \
        vpshufb RTMP4, RX1, RTMP2;                              \
        vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;              \
        vpxor RTMP2, r0, r0; /* r0 ^ x */                       \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */           \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;             \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */           \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;             \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */           \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpslld $2, RTMP0, RTMP1;                                \
        vpsrld $30, RTMP0, RTMP0;                               \
        vpxor RTMP0, s0, s0;                                    \
        vpxor RTMP1, s0, s0;                                    \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */           \
        /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpslld $2, RTMP2, RTMP3;                                \
        vpsrld $30, RTMP2, RTMP2;                               \
        vpxor RTMP2, r0, r0;                                    \
        vpxor RTMP3, r0, r0;

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
        ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
        ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
        ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
        ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk8;

#undef ROUND

        vmovdqa .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        FRAME_END
        RET;
SYM_FUNC_END(__sm4_crypt_blk8)

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (1..8 blocks)
         *      %rdx: src (1..8 blocks)
         *      %rcx: num blocks (1..8)
         */
        cmpq $5, %rcx;
        jb sm4_aesni_avx_crypt4;

        FRAME_BEGIN

        vmovdqu (0 * 16)(%rdx), RA0;
        vmovdqu (1 * 16)(%rdx), RA1;
        vmovdqu (2 * 16)(%rdx), RA2;
        vmovdqu (3 * 16)(%rdx), RA3;
        vmovdqu (4 * 16)(%rdx), RB0;
        vmovdqa RB0, RB1;
        vmovdqa RB0, RB2;
        vmovdqa RB0, RB3;
        je .Lblk8_load_input_done;
        vmovdqu (5 * 16)(%rdx), RB1;
        cmpq $7, %rcx;
        jb .Lblk8_load_input_done;
        vmovdqu (6 * 16)(%rdx), RB2;
        je .Lblk8_load_input_done;
        vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
        call __sm4_crypt_blk8;

        cmpq $6, %rcx;
        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        jb .Lblk8_store_output_done;
        vmovdqu RB1, (5 * 16)(%rsi);
        je .Lblk8_store_output_done;
        vmovdqu RB2, (6 * 16)(%rsi);
        cmpq $7, %rcx;
        je .Lblk8_store_output_done;
        vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)

/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        /* load IV and byteswap */
        vmovdqu (%rcx), RA0;

        vmovdqa .Lbswap128_mask rRIP, RBSWAP;
        vpshufb RBSWAP, RA0, RTMP0; /* be => le */

        vpcmpeqd RNOT, RNOT, RNOT;
        vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

#define inc_le128(x, minus_one, tmp)    \
        vpcmpeqq minus_one, x, tmp;     \
        vpsubq minus_one, x, x;         \
        vpslldq $8, tmp, tmp;           \
        vpsubq tmp, x, x;

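/*
 * A scalar model of the inc_le128() macro above (a sketch only): the
 * counter is handled as a 128-bit little-endian integer after the
 * .Lbswap128_mask shuffle, and shuffled back to its big-endian memory
 * form before it is encrypted or stored.
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };
 *
 *	static void ctr128_inc_le(struct le128 *ctr)
 *	{
 *		if (++ctr->lo == 0)	// carry from low into high qword
 *			ctr->hi++;
 *	}
 */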

        /* construct IVs */
        inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
        vpshufb RBSWAP, RTMP0, RA1;
        inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
        vpshufb RBSWAP, RTMP0, RA2;
        inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
        vpshufb RBSWAP, RTMP0, RA3;
        inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
        vpshufb RBSWAP, RTMP0, RB0;
        inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
        vpshufb RBSWAP, RTMP0, RB1;
        inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
        vpshufb RBSWAP, RTMP0, RB2;
        inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
        vpshufb RBSWAP, RTMP0, RB3;
        inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
        vpshufb RBSWAP, RTMP0, RTMP1;

        /* store new IV */
        vmovdqu RTMP1, (%rcx);

        call __sm4_crypt_blk8;

        vpxor (0 * 16)(%rdx), RA0, RA0;
        vpxor (1 * 16)(%rdx), RA1, RA1;
        vpxor (2 * 16)(%rdx), RA2, RA2;
        vpxor (3 * 16)(%rdx), RA3, RA3;
        vpxor (4 * 16)(%rdx), RB0, RB0;
        vpxor (5 * 16)(%rdx), RB1, RB1;
        vpxor (6 * 16)(%rdx), RB2, RB2;
        vpxor (7 * 16)(%rdx), RB3, RB3;

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)

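/*
 * A scalar sketch of what sm4_aesni_avx_cbc_dec_blk8() below computes
 * for its fixed batch of eight blocks.  sm4_block() stands in for the
 * single-block SM4 primitive and is not defined in this file; SM4
 * decryption is the same round function run with the round keys in
 * reverse order, which is the key schedule the caller is expected to
 * pass here.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	extern void sm4_block(const uint32_t rk[32], uint8_t out[16],
 *			      const uint8_t in[16]);
 *
 *	static void cbc_dec_blk8(const uint32_t rk_dec[32], uint8_t *dst,
 *				 const uint8_t *src, uint8_t iv[16])
 *	{
 *		uint8_t prev[16], cur[16], tmp[16];
 *		int i, j;
 *
 *		memcpy(prev, iv, 16);
 *		for (i = 0; i < 8; i++) {
 *			memcpy(cur, src + 16 * i, 16); // copy: dst may alias src
 *			sm4_block(rk_dec, tmp, cur);
 *			for (j = 0; j < 16; j++)
 *				dst[16 * i + j] = tmp[j] ^ prev[j];
 *			memcpy(prev, cur, 16);
 *		}
 *		memcpy(iv, prev, 16);	// new IV: last ciphertext block
 *	}
 *
 * CFB decryption (the last function in this file) is analogous, except
 * that the previous ciphertext block is encrypted and XORed with the
 * current one: P[i] = E(C[i-1]) ^ C[i], with C[-1] = IV.
 */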

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv
         */
        FRAME_BEGIN

        vmovdqu (0 * 16)(%rdx), RA0;
        vmovdqu (1 * 16)(%rdx), RA1;
        vmovdqu (2 * 16)(%rdx), RA2;
        vmovdqu (3 * 16)(%rdx), RA3;
        vmovdqu (4 * 16)(%rdx), RB0;
        vmovdqu (5 * 16)(%rdx), RB1;
        vmovdqu (6 * 16)(%rdx), RB2;
        vmovdqu (7 * 16)(%rdx), RB3;

        call __sm4_crypt_blk8;

        vmovdqu (7 * 16)(%rdx), RNOT;
        vpxor (%rcx), RA0, RA0;
        vpxor (0 * 16)(%rdx), RA1, RA1;
        vpxor (1 * 16)(%rdx), RA2, RA2;
        vpxor (2 * 16)(%rdx), RA3, RA3;
        vpxor (3 * 16)(%rdx), RB0, RB0;
        vpxor (4 * 16)(%rdx), RB1, RB1;
        vpxor (5 * 16)(%rdx), RB2, RB2;
        vpxor (6 * 16)(%rdx), RB3, RB3;
        vmovdqu RNOT, (%rcx); /* store new IV */

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)

/*
 * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv
         */
        FRAME_BEGIN

        /* Load input */
        vmovdqu (%rcx), RA0;
        vmovdqu 0 * 16(%rdx), RA1;
        vmovdqu 1 * 16(%rdx), RA2;
        vmovdqu 2 * 16(%rdx), RA3;
        vmovdqu 3 * 16(%rdx), RB0;
        vmovdqu 4 * 16(%rdx), RB1;
        vmovdqu 5 * 16(%rdx), RB2;
        vmovdqu 6 * 16(%rdx), RB3;

        /* Update IV */
        vmovdqu 7 * 16(%rdx), RNOT;
        vmovdqu RNOT, (%rcx);

        call __sm4_crypt_blk8;

        vpxor (0 * 16)(%rdx), RA0, RA0;
        vpxor (1 * 16)(%rdx), RA1, RA1;
        vpxor (2 * 16)(%rdx), RA2, RA2;
        vpxor (3 * 16)(%rdx), RA3, RA3;
        vpxor (4 * 16)(%rdx), RB0, RB0;
        vpxor (5 * 16)(%rdx), RB1, RB1;
        vpxor (6 * 16)(%rdx), RB2, RB2;
        vpxor (7 * 16)(%rdx), RB3, RB3;

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)