sm4-neon-core.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

#define PREPARE \
        adr_l x5, crypto_sm4_sbox; \
        ld1 {v16.16b-v19.16b}, [x5], #64; \
        ld1 {v20.16b-v23.16b}, [x5], #64; \
        ld1 {v24.16b-v27.16b}, [x5], #64; \
        ld1 {v28.16b-v31.16b}, [x5];

#define transpose_4x4(s0, s1, s2, s3) \
        zip1 RTMP0.4s, s0.4s, s1.4s; \
        zip1 RTMP1.4s, s2.4s, s3.4s; \
        zip2 RTMP2.4s, s0.4s, s1.4s; \
        zip2 RTMP3.4s, s2.4s, s3.4s; \
        zip1 s0.2d, RTMP0.2d, RTMP1.2d; \
        zip2 s1.2d, RTMP0.2d, RTMP1.2d; \
        zip1 s2.2d, RTMP2.2d, RTMP3.2d; \
        zip2 s3.2d, RTMP2.2d, RTMP3.2d;

#define rotate_clockwise_90(s0, s1, s2, s3) \
        zip1 RTMP0.4s, s1.4s, s0.4s; \
        zip2 RTMP1.4s, s1.4s, s0.4s; \
        zip1 RTMP2.4s, s3.4s, s2.4s; \
        zip2 RTMP3.4s, s3.4s, s2.4s; \
        zip1 s0.2d, RTMP2.2d, RTMP0.2d; \
        zip2 s1.2d, RTMP2.2d, RTMP0.2d; \
        zip1 s2.2d, RTMP3.2d, RTMP1.2d; \
        zip2 s3.2d, RTMP3.2d, RTMP1.2d;

#define ROUND4(round, s0, s1, s2, s3) \
        dup RX0.4s, RKEY.s[round]; \
        /* rk ^ s1 ^ s2 ^ s3 */ \
        eor RTMP1.16b, s2.16b, s3.16b; \
        eor RX0.16b, RX0.16b, s1.16b; \
        eor RX0.16b, RX0.16b, RTMP1.16b; \
        \
        /* sbox, non-linear part */ \
        movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
        tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
        \
        /* linear part */ \
        shl RTMP1.4s, RTMP0.4s, #8; \
        shl RTMP2.4s, RTMP0.4s, #16; \
        shl RTMP3.4s, RTMP0.4s, #24; \
        sri RTMP1.4s, RTMP0.4s, #(32-8); \
        sri RTMP2.4s, RTMP0.4s, #(32-16); \
        sri RTMP3.4s, RTMP0.4s, #(32-24); \
        /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
        eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \
        eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \
        /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
        eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \
        shl RTMP2.4s, RTMP1.4s, 2; \
        sri RTMP2.4s, RTMP1.4s, #(32-2); \
        eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \
        /* s0 ^= RTMP3 */ \
        eor s0.16b, s0.16b, RTMP3.16b;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
        rev32 b0.16b, b0.16b; \
        rev32 b1.16b, b1.16b; \
        rev32 b2.16b, b2.16b; \
        rev32 b3.16b, b3.16b; \
        \
        transpose_4x4(b0, b1, b2, b3); \
        \
        mov x6, 8; \
4: \
        ld1 {RKEY.4s}, [x0], #16; \
        subs x6, x6, #1; \
        \
        ROUND4(0, b0, b1, b2, b3); \
        ROUND4(1, b1, b2, b3, b0); \
        ROUND4(2, b2, b3, b0, b1); \
        ROUND4(3, b3, b0, b1, b2); \
        \
        bne 4b; \
        \
        rotate_clockwise_90(b0, b1, b2, b3); \
        rev32 b0.16b, b0.16b; \
        rev32 b1.16b, b1.16b; \
        rev32 b2.16b, b2.16b; \
        rev32 b3.16b, b3.16b; \
        \
        /* repoint to rkey */ \
        sub x0, x0, #128;

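/*
 * Note on the round macros: ROUND4 above and ROUND8 below vectorise the
 * scalar SM4 round s0 ^= T(rk ^ s1 ^ s2 ^ s3) across 4 (resp. 2 x 4)
 * blocks held column-wise in NEON registers. Roughly, in C (illustrative
 * sketch only; sbox[] and rol32() are not defined in this file):
 *
 *	u32 x = rk ^ s1 ^ s2 ^ s3;
 *	x = ((u32)sbox[x & 0xff]) |
 *	    ((u32)sbox[(x >> 8) & 0xff] << 8) |
 *	    ((u32)sbox[(x >> 16) & 0xff] << 16) |
 *	    ((u32)sbox[(x >> 24) & 0xff] << 24);
 *	s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *
 * The tbl/tbx chain performs the byte-wise sbox lookup: the 256-byte table
 * is split across v16-v31 in four 64-byte ranges, with the index rebased by
 * 64 (RTMP3) before each tbx. The shl/sri/eor sequence is the linear
 * transform L(x), computed as x ^ rol32(x, 24) ^ rol32(x ^ rol32(x, 8) ^
 * rol32(x, 16), 2), which expands to the rotation set shown above.
 */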
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \
        /* rk ^ s1 ^ s2 ^ s3 */ \
        dup RX0.4s, RKEY.s[round]; \
        eor RTMP0.16b, s2.16b, s3.16b; \
        mov RX1.16b, RX0.16b; \
        eor RTMP1.16b, t2.16b, t3.16b; \
        eor RX0.16b, RX0.16b, s1.16b; \
        eor RX1.16b, RX1.16b, t1.16b; \
        eor RX0.16b, RX0.16b, RTMP0.16b; \
        eor RX1.16b, RX1.16b, RTMP1.16b; \
        \
        /* sbox, non-linear part */ \
        movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
        tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
        tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        sub RX1.16b, RX1.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
        tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        sub RX1.16b, RX1.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
        tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        sub RX1.16b, RX1.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
        tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \
        \
        /* linear part */ \
        shl RX0.4s, RTMP0.4s, #8; \
        shl RX1.4s, RTMP1.4s, #8; \
        shl RTMP2.4s, RTMP0.4s, #16; \
        shl RTMP3.4s, RTMP1.4s, #16; \
        sri RX0.4s, RTMP0.4s, #(32 - 8); \
        sri RX1.4s, RTMP1.4s, #(32 - 8); \
        sri RTMP2.4s, RTMP0.4s, #(32 - 16); \
        sri RTMP3.4s, RTMP1.4s, #(32 - 16); \
        /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
        eor RX0.16b, RX0.16b, RTMP0.16b; \
        eor RX1.16b, RX1.16b, RTMP1.16b; \
        eor RX0.16b, RX0.16b, RTMP2.16b; \
        eor RX1.16b, RX1.16b, RTMP3.16b; \
        /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \
        shl RTMP2.4s, RTMP0.4s, #24; \
        shl RTMP3.4s, RTMP1.4s, #24; \
        sri RTMP2.4s, RTMP0.4s, #(32 - 24); \
        sri RTMP3.4s, RTMP1.4s, #(32 - 24); \
        eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
        eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
        shl RTMP2.4s, RX0.4s, #2; \
        shl RTMP3.4s, RX1.4s, #2; \
        sri RTMP2.4s, RX0.4s, #(32 - 2); \
        sri RTMP3.4s, RX1.4s, #(32 - 2); \
        eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
        eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
        /* s0/t0 ^= RTMP0/1 */ \
        eor s0.16b, s0.16b, RTMP0.16b; \
        eor t0.16b, t0.16b, RTMP1.16b;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
        rev32 b0.16b, b0.16b; \
        rev32 b1.16b, b1.16b; \
        rev32 b2.16b, b2.16b; \
        rev32 b3.16b, b3.16b; \
        rev32 b4.16b, b4.16b; \
        rev32 b5.16b, b5.16b; \
        rev32 b6.16b, b6.16b; \
        rev32 b7.16b, b7.16b; \
        \
        transpose_4x4(b0, b1, b2, b3); \
        transpose_4x4(b4, b5, b6, b7); \
        \
        mov x6, 8; \
8: \
        ld1 {RKEY.4s}, [x0], #16; \
        subs x6, x6, #1; \
        \
        ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7); \
        ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4); \
        ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5); \
        ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6); \
        \
        bne 8b; \
        \
        rotate_clockwise_90(b0, b1, b2, b3); \
        rotate_clockwise_90(b4, b5, b6, b7); \
        rev32 b0.16b, b0.16b; \
        rev32 b1.16b, b1.16b; \
        rev32 b2.16b, b2.16b; \
        rev32 b3.16b, b3.16b; \
        rev32 b4.16b, b4.16b; \
        rev32 b5.16b, b5.16b; \
        rev32 b6.16b, b6.16b; \
        rev32 b7.16b, b7.16b; \
        \
        /* repoint to rkey */ \
        sub x0, x0, #128;

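/*
 * The entry points below never need a scalar fallback: when fewer blocks
 * than the macro width are requested, the registers for the missing blocks
 * are pre-filled with a copy of an already-loaded block and only the
 * requested number of results is written back. For the 1..4 block case
 * this amounts to (illustrative pseudo-C, not part of this file):
 *
 *	v0 = load(block[0]); v1 = v2 = v3 = v0;	// pad with copies of block 0
 *	if (n >= 2) v1 = load(block[1]);
 *	if (n >= 3) v2 = load(block[2]);
 *	if (n == 4) v3 = load(block[3]);
 *	SM4_CRYPT_BLK4(v0, v1, v2, v3);
 *	store(v0 .. v[n-1]);
 */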
.align 3
SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: num blocks (1..4)
         */
        PREPARE;

        ld1 {v0.16b}, [x2], #16;
        mov v1.16b, v0.16b;
        mov v2.16b, v0.16b;
        mov v3.16b, v0.16b;
        cmp w3, #2;
        blt .Lblk4_load_input_done;
        ld1 {v1.16b}, [x2], #16;
        beq .Lblk4_load_input_done;
        ld1 {v2.16b}, [x2], #16;
        cmp w3, #3;
        beq .Lblk4_load_input_done;
        ld1 {v3.16b}, [x2];

.Lblk4_load_input_done:
        SM4_CRYPT_BLK4(v0, v1, v2, v3);

        st1 {v0.16b}, [x1], #16;
        cmp w3, #2;
        blt .Lblk4_store_output_done;
        st1 {v1.16b}, [x1], #16;
        beq .Lblk4_store_output_done;
        st1 {v2.16b}, [x1], #16;
        cmp w3, #3;
        beq .Lblk4_store_output_done;
        st1 {v3.16b}, [x1];

.Lblk4_store_output_done:
        ret;
SYM_FUNC_END(__sm4_neon_crypt_blk1_4)

.align 3
SYM_FUNC_START(sm4_neon_crypt_blk1_8)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: num blocks (1..8)
         */
        cmp w3, #5;
        blt __sm4_neon_crypt_blk1_4;

        PREPARE;

        ld1 {v0.16b-v3.16b}, [x2], #64;
        ld1 {v4.16b}, [x2], #16;
        mov v5.16b, v4.16b;
        mov v6.16b, v4.16b;
        mov v7.16b, v4.16b;
        beq .Lblk8_load_input_done;
        ld1 {v5.16b}, [x2], #16;
        cmp w3, #7;
        blt .Lblk8_load_input_done;
        ld1 {v6.16b}, [x2], #16;
        beq .Lblk8_load_input_done;
        ld1 {v7.16b}, [x2];

.Lblk8_load_input_done:
        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        cmp w3, #6;
        st1 {v0.16b-v3.16b}, [x1], #64;
        st1 {v4.16b}, [x1], #16;
        blt .Lblk8_store_output_done;
        st1 {v5.16b}, [x1], #16;
        beq .Lblk8_store_output_done;
        st1 {v6.16b}, [x1], #16;
        cmp w3, #7;
        beq .Lblk8_store_output_done;
        st1 {v7.16b}, [x1];

.Lblk8_store_output_done:
        ret;
SYM_FUNC_END(sm4_neon_crypt_blk1_8)

.align 3
SYM_FUNC_START(sm4_neon_crypt_blk8)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks (multiples of 8)
         */
        PREPARE;

.Lcrypt_loop_blk:
        subs w3, w3, #8;
        bmi .Lcrypt_end;

        ld1 {v0.16b-v3.16b}, [x2], #64;
        ld1 {v4.16b-v7.16b}, [x2], #64;

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        st1 {v0.16b-v3.16b}, [x1], #64;
        st1 {v4.16b-v7.16b}, [x1], #64;

        b .Lcrypt_loop_blk;

.Lcrypt_end:
        ret;
SYM_FUNC_END(sm4_neon_crypt_blk8)

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec_blk8)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks (multiples of 8)
         */
        PREPARE;

        ld1 {RIV.16b}, [x3];

.Lcbc_loop_blk:
        subs w4, w4, #8;
        bmi .Lcbc_end;

        ld1 {v0.16b-v3.16b}, [x2], #64;
        ld1 {v4.16b-v7.16b}, [x2];

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        sub x2, x2, #64;
        eor v0.16b, v0.16b, RIV.16b;
        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v1.16b, v1.16b, RTMP0.16b;
        eor v2.16b, v2.16b, RTMP1.16b;
        eor v3.16b, v3.16b, RTMP2.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        eor v4.16b, v4.16b, RTMP3.16b;
        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v5.16b, v5.16b, RTMP0.16b;
        eor v6.16b, v6.16b, RTMP1.16b;
        eor v7.16b, v7.16b, RTMP2.16b;

        mov RIV.16b, RTMP3.16b;
        st1 {v4.16b-v7.16b}, [x1], #64;

        b .Lcbc_loop_blk;

.Lcbc_end:
        /* store new IV */
        st1 {RIV.16b}, [x3];

        ret;
SYM_FUNC_END(sm4_neon_cbc_dec_blk8)

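/*
 * CFB decryption reuses the encryption core: the keystream for ciphertext
 * block C[i] is E_K(C[i-1]), with C[-1] being the IV, so the routine below
 * encrypts { IV, C[0], ..., C[6] } in one SM4_CRYPT_BLK8 call, XORs the
 * result with C[0], ..., C[7], and carries C[7] forward as the next IV.
 * Per block: P[i] = E_K(C[i-1]) ^ C[i].
 */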
.align 3
SYM_FUNC_START(sm4_neon_cfb_dec_blk8)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks (multiples of 8)
         */
        PREPARE;

        ld1 {v0.16b}, [x3];

.Lcfb_loop_blk:
        subs w4, w4, #8;
        bmi .Lcfb_end;

        ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
        ld1 {v4.16b-v7.16b}, [x2];

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        sub x2, x2, #48;
        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v0.16b, v0.16b, RTMP0.16b;
        eor v1.16b, v1.16b, RTMP1.16b;
        eor v2.16b, v2.16b, RTMP2.16b;
        eor v3.16b, v3.16b, RTMP3.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v4.16b, v4.16b, RTMP0.16b;
        eor v5.16b, v5.16b, RTMP1.16b;
        eor v6.16b, v6.16b, RTMP2.16b;
        eor v7.16b, v7.16b, RTMP3.16b;
        st1 {v4.16b-v7.16b}, [x1], #64;

        mov v0.16b, RTMP3.16b;

        b .Lcfb_loop_blk;

.Lcfb_end:
        /* store new IV */
        st1 {v0.16b}, [x3];

        ret;
SYM_FUNC_END(sm4_neon_cfb_dec_blk8)

.align 3
SYM_FUNC_START(sm4_neon_ctr_enc_blk8)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks (multiples of 8)
         */
        PREPARE;

        ldp x7, x8, [x3];
        rev x7, x7;
        rev x8, x8;

.Lctr_loop_blk:
        subs w4, w4, #8;
        bmi .Lctr_end;

#define inc_le128(vctr) \
        mov vctr.d[1], x8; \
        mov vctr.d[0], x7; \
        adds x8, x8, #1; \
        adc x7, x7, xzr; \
        rev64 vctr.16b, vctr.16b;

        /* construct CTRs */
        inc_le128(v0);                  /* +0 */
        inc_le128(v1);                  /* +1 */
        inc_le128(v2);                  /* +2 */
        inc_le128(v3);                  /* +3 */
        inc_le128(v4);                  /* +4 */
        inc_le128(v5);                  /* +5 */
        inc_le128(v6);                  /* +6 */
        inc_le128(v7);                  /* +7 */

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v0.16b, v0.16b, RTMP0.16b;
        eor v1.16b, v1.16b, RTMP1.16b;
        eor v2.16b, v2.16b, RTMP2.16b;
        eor v3.16b, v3.16b, RTMP3.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v4.16b, v4.16b, RTMP0.16b;
        eor v5.16b, v5.16b, RTMP1.16b;
        eor v6.16b, v6.16b, RTMP2.16b;
        eor v7.16b, v7.16b, RTMP3.16b;
        st1 {v4.16b-v7.16b}, [x1], #64;

        b .Lctr_loop_blk;

.Lctr_end:
        /* store new CTR */
        rev x7, x7;
        rev x8, x8;
        stp x7, x8, [x3];

        ret;
SYM_FUNC_END(sm4_neon_ctr_enc_blk8)
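
/*
 * For reference, the accompanying C glue code would declare these entry
 * points roughly as below. The prototypes are an illustrative sketch
 * derived from the register usage documented in the per-function comments
 * above (x0 = round key array, x1 = dst, x2 = src, x3 = iv/ctr,
 * w3/w4 = number of blocks); the authoritative declarations live in the
 * glue source, not here.
 *
 *	asmlinkage void sm4_neon_crypt_blk1_8(const u32 *rkey, u8 *dst,
 *					      const u8 *src, unsigned int nblks);
 *	asmlinkage void sm4_neon_crypt_blk8(const u32 *rkey, u8 *dst,
 *					    const u8 *src, unsigned int nblks);
 *	asmlinkage void sm4_neon_cbc_dec_blk8(const u32 *rkey, u8 *dst,
 *					      const u8 *src, u8 *iv,
 *					      unsigned int nblks);
 *	asmlinkage void sm4_neon_cfb_dec_blk8(const u32 *rkey, u8 *dst,
 *					      const u8 *src, u8 *iv,
 *					      unsigned int nblks);
 *	asmlinkage void sm4_neon_ctr_enc_blk8(const u32 *rkey, u8 *dst,
 *					      const u8 *src, u8 *ctr,
 *					      unsigned int nblks);
 */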