/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20

/* Helper macros. */

#define PREPARE                                       \
	ld1		{v24.16b-v27.16b}, [x0], #64; \
	ld1		{v28.16b-v31.16b}, [x0];

#define SM4_CRYPT_BLK(b0)                           \
	rev32		b0.16b, b0.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	rev32		b0.16b, b0.16b;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)              \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b1.4s, v24.4s;              \
	sm4e		b2.4s, v24.4s;              \
	sm4e		b3.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b1.4s, v25.4s;              \
	sm4e		b2.4s, v25.4s;              \
	sm4e		b3.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b1.4s, v26.4s;              \
	sm4e		b2.4s, v26.4s;              \
	sm4e		b3.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b1.4s, v27.4s;              \
	sm4e		b2.4s, v27.4s;              \
	sm4e		b3.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b1.4s, v28.4s;              \
	sm4e		b2.4s, v28.4s;              \
	sm4e		b3.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b1.4s, v29.4s;              \
	sm4e		b2.4s, v29.4s;              \
	sm4e		b3.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b1.4s, v30.4s;              \
	sm4e		b2.4s, v30.4s;              \
	sm4e		b3.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	sm4e		b1.4s, v31.4s;              \
	sm4e		b2.4s, v31.4s;              \
	sm4e		b3.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b1.4s, v24.4s;              \
	sm4e		b2.4s, v24.4s;              \
	sm4e		b3.4s, v24.4s;              \
	sm4e		b4.4s, v24.4s;              \
	sm4e		b5.4s, v24.4s;              \
	sm4e		b6.4s, v24.4s;              \
	sm4e		b7.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b1.4s, v25.4s;              \
	sm4e		b2.4s, v25.4s;              \
	sm4e		b3.4s, v25.4s;              \
	sm4e		b4.4s, v25.4s;              \
	sm4e		b5.4s, v25.4s;              \
	sm4e		b6.4s, v25.4s;              \
	sm4e		b7.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b1.4s, v26.4s;              \
	sm4e		b2.4s, v26.4s;              \
	sm4e		b3.4s, v26.4s;              \
	sm4e		b4.4s, v26.4s;              \
	sm4e		b5.4s, v26.4s;              \
	sm4e		b6.4s, v26.4s;              \
	sm4e		b7.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b1.4s, v27.4s;              \
	sm4e		b2.4s, v27.4s;              \
	sm4e		b3.4s, v27.4s;              \
	sm4e		b4.4s, v27.4s;              \
	sm4e		b5.4s, v27.4s;              \
	sm4e		b6.4s, v27.4s;              \
	sm4e		b7.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b1.4s, v28.4s;              \
	sm4e		b2.4s, v28.4s;              \
	sm4e		b3.4s, v28.4s;              \
	sm4e		b4.4s, v28.4s;              \
	sm4e		b5.4s, v28.4s;              \
	sm4e		b6.4s, v28.4s;              \
	sm4e		b7.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b1.4s, v29.4s;              \
	sm4e		b2.4s, v29.4s;              \
	sm4e		b3.4s, v29.4s;              \
	sm4e		b4.4s, v29.4s;              \
	sm4e		b5.4s, v29.4s;              \
	sm4e		b6.4s, v29.4s;              \
	sm4e		b7.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b1.4s, v30.4s;              \
	sm4e		b2.4s, v30.4s;              \
	sm4e		b3.4s, v30.4s;              \
	sm4e		b4.4s, v30.4s;              \
	sm4e		b5.4s, v30.4s;              \
	sm4e		b6.4s, v30.4s;              \
	sm4e		b7.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	sm4e		b1.4s, v31.4s;              \
	sm4e		b2.4s, v31.4s;              \
	sm4e		b3.4s, v31.4s;              \
	sm4e		b4.4s, v31.4s;              \
	sm4e		b5.4s, v31.4s;              \
	sm4e		b6.4s, v31.4s;              \
	sm4e		b7.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	rev64		b4.4s, b4.4s;               \
	rev64		b5.4s, b5.4s;               \
	rev64		b6.4s, b6.4s;               \
	rev64		b7.4s, b7.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	ext		b4.16b, b4.16b, b4.16b, #8; \
	ext		b5.16b, b5.16b, b5.16b, #8; \
	ext		b6.16b, b6.16b, b6.16b, #8; \
	ext		b7.16b, b7.16b, b7.16b, #8; \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;
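
/*
 * Note on the epilogue shared by the SM4_CRYPT_BLK* macros above: SM4
 * emits its output in the reverse order of the final state words, so
 * after the eight sm4e steps (four rounds each) the four 32-bit lanes
 * must be reversed.  "rev64 .4s" swaps the words within each 64-bit
 * half and "ext #8" swaps the two halves, together reversing all four
 * lanes:
 *
 *	(w0, w1, w2, w3)  --rev64-->  (w1, w0, w3, w2)
 *	                  --ext #8->  (w3, w2, w1, w0)
 *
 * The trailing "rev32 .16b" mirrors the leading one: it converts each
 * word back to the big-endian byte order used in memory.
 */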

.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1		{v0.16b}, [x0];
	rev32		v0.16b, v0.16b;
	ld1		{v1.16b}, [x3];
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];
	rev64		v7.4s, v7.4s;
	rev64		v6.4s, v6.4s;
	rev64		v5.4s, v5.4s;
	rev64		v4.4s, v4.4s;
	rev64		v3.4s, v3.4s;
	rev64		v2.4s, v2.4s;
	rev64		v1.4s, v1.4s;
	rev64		v0.4s, v0.4s;
	ext		v7.16b, v7.16b, v7.16b, #8;
	ext		v6.16b, v6.16b, v6.16b, #8;
	ext		v5.16b, v5.16b, v5.16b, #8;
	ext		v4.16b, v4.16b, v4.16b, #8;
	ext		v3.16b, v3.16b, v3.16b, #8;
	ext		v2.16b, v2.16b, v2.16b, #8;
	ext		v1.16b, v1.16b, v1.16b, #8;
	ext		v0.16b, v0.16b, v0.16b, #8;
	st1		{v7.16b}, [x2], #16;
	st1		{v6.16b}, [x2], #16;
	st1		{v5.16b}, [x2], #16;
	st1		{v4.16b}, [x2], #16;
	st1		{v3.16b}, [x2], #16;
	st1		{v2.16b}, [x2], #16;
	st1		{v1.16b}, [x2], #16;
	st1		{v0.16b}, [x2];

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	PREPARE;

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)
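
/*
 * sm4_ce_expand_key derives the decryption schedule directly from the
 * encryption one: SM4 decryption uses the same round function with the
 * 32 round keys applied in reverse order.  The rev64/ext pairs above
 * reverse the four round keys inside each 128-bit vector, and storing
 * v7 down to v0 reverses the vectors themselves, so rkey_dec ends up
 * as rkey_enc reversed word-for-word.
 *
 * As a rough sketch (an assumption for illustration, not copied from
 * the glue code), the C-side prototypes these two entry points are
 * expected to match look like:
 *
 *	asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc,
 *					  u32 *rkey_dec, const u32 *fk,
 *					  const u32 *ck);
 *	asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst,
 *					   const u8 *src);
 */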

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	PREPARE;

.Lcrypt_loop_blk:
	sub		w3, w3, #8;
	tbnz		w3, #31, .Lcrypt_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;
	b		.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add		w3, w3, #8;
	cmp		w3, #4;
	blt		.Lcrypt_tail4;

	sub		w3, w3, #4;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub		w3, w3, #1;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ld1		{RIV.16b}, [x3];

.Lcbc_enc_loop:
	sub		w4, w4, #1;

	ld1		{RTMP0.16b}, [x2], #16;
	eor		RIV.16b, RIV.16b, RTMP0.16b;

	SM4_CRYPT_BLK(RIV);

	st1		{RIV.16b}, [x1], #16;

	cbnz		w4, .Lcbc_enc_loop;

	/* store new IV */
	st1		{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ld1		{RIV.16b}, [x3];

.Lcbc_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lcbc_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	sub		x2, x2, #64;
	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v1.16b, v1.16b, RTMP0.16b;
	eor		v2.16b, v2.16b, RTMP1.16b;
	eor		v3.16b, v3.16b, RTMP2.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	eor		v4.16b, v4.16b, RTMP3.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v5.16b, v5.16b, RTMP0.16b;
	eor		v6.16b, v6.16b, RTMP1.16b;
	eor		v7.16b, v7.16b, RTMP2.16b;

	mov		RIV.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w4, .Lcbc_end;
	b		.Lcbc_loop_blk;

.Lcbc_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lcbc_tail4;

	sub		w4, w4, #4;

	ld1		{v0.16b-v3.16b}, [x2];

	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v1.16b, v1.16b, RTMP0.16b;
	eor		v2.16b, v2.16b, RTMP1.16b;
	eor		v3.16b, v3.16b, RTMP2.16b;

	mov		RIV.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w4, .Lcbc_end;

.Lcbc_tail4:
	sub		w4, w4, #1;

	ld1		{v0.16b}, [x2];

	SM4_CRYPT_BLK(v0);

	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RIV.16b}, [x2], #16;
	st1		{v0.16b}, [x1], #16;

	cbnz		w4, .Lcbc_tail4;

.Lcbc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_ce_cbc_dec)
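
/*
 * CBC decryption above is parallelizable because each plaintext block
 * depends only on ciphertext:
 *
 *	P[i] = Dec(C[i]) ^ C[i-1]	(with C[-1] being the IV)
 *
 * Hence the code decrypts up to eight blocks at once, then reloads the
 * ciphertext from [x2] (the working registers were clobbered by the
 * decryption) to XOR it in, and carries the last ciphertext block
 * forward in RIV as the chaining value for the next iteration.  CBC
 * encryption, by contrast, is inherently serial, which is why
 * sm4_ce_cbc_enc processes a single block per loop iteration.
 */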

.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ld1		{RIV.16b}, [x3];

.Lcfb_enc_loop:
	sub		w4, w4, #1;

	SM4_CRYPT_BLK(RIV);

	ld1		{RTMP0.16b}, [x2], #16;
	eor		RIV.16b, RIV.16b, RTMP0.16b;
	st1		{RIV.16b}, [x1], #16;

	cbnz		w4, .Lcfb_enc_loop;

	/* store new IV */
	st1		{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ld1		{v0.16b}, [x3];

.Lcfb_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lcfb_tail8;

	ld1		{v1.16b, v2.16b, v3.16b}, [x2], #48;
	ld1		{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	sub		x2, x2, #48;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	mov		v0.16b, RTMP3.16b;

	cbz		w4, .Lcfb_end;
	b		.Lcfb_loop_blk;

.Lcfb_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lcfb_tail4;

	sub		w4, w4, #4;

	ld1		{v1.16b, v2.16b, v3.16b}, [x2];

	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	mov		v0.16b, RTMP3.16b;

	cbz		w4, .Lcfb_end;

.Lcfb_tail4:
	sub		w4, w4, #1;

	SM4_CRYPT_BLK(v0);

	ld1		{RTMP0.16b}, [x2], #16;
	eor		v0.16b, v0.16b, RTMP0.16b;
	st1		{v0.16b}, [x1], #16;

	mov		v0.16b, RTMP0.16b;

	cbnz		w4, .Lcfb_tail4;

.Lcfb_end:
	/* store new IV */
	st1		{v0.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_ce_cfb_dec)
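
/*
 * The same asymmetry applies to CFB: encryption (sm4_ce_cfb_enc) is
 * serial because each keystream block is the encryption of the
 * previous ciphertext block, but decryption can run ahead since all of
 * the ciphertext is already known:
 *
 *	P[i] = Enc(C[i-1]) ^ C[i]	(with C[-1] being the IV)
 *
 * sm4_ce_cfb_dec therefore loads the IV into v0 and the next seven
 * ciphertext blocks into v1-v7, encrypts all eight in parallel, and
 * XORs the results with the eight ciphertext blocks reloaded from
 * [x2], keeping the last ciphertext block as the next IV.
 */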

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ldp		x7, x8, [x3];
	rev		x7, x7;
	rev		x8, x8;

.Lctr_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lctr_tail8;

#define inc_le128(vctr)                     \
	mov		vctr.d[1], x8;      \
	mov		vctr.d[0], x7;      \
	adds		x8, x8, #1;         \
	adc		x7, x7, xzr;        \
	rev64		vctr.16b, vctr.16b;

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */
	inc_le128(v4);			/* +4 */
	inc_le128(v5);			/* +5 */
	inc_le128(v6);			/* +6 */
	inc_le128(v7);			/* +7 */

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w4, .Lctr_end;
	b		.Lctr_loop_blk;

.Lctr_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lctr_tail4;

	sub		w4, w4, #4;

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */

	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w4, .Lctr_end;

.Lctr_tail4:
	sub		w4, w4, #1;

	/* construct CTRs */
	inc_le128(v0);

	SM4_CRYPT_BLK(v0);

	ld1		{RTMP0.16b}, [x2], #16;
	eor		v0.16b, v0.16b, RTMP0.16b;
	st1		{v0.16b}, [x1], #16;

	cbnz		w4, .Lctr_tail4;

.Lctr_end:
	/* store new CTR */
	rev		x7, x7;
	rev		x8, x8;
	stp		x7, x8, [x3];

	ret;
SYM_FUNC_END(sm4_ce_ctr_enc)
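
/*
 * Note on inc_le128 in sm4_ce_ctr_enc: despite its name, it produces
 * the big-endian 128-bit counter block that CTR mode requires.  The
 * counter is kept as a native-order integer pair in x7 (upper 64 bits)
 * and x8 (lower 64 bits), courtesy of the ldp/rev prologue.  Each
 * invocation materializes the current value into a vector,
 * post-increments x8/x7 with adds/adc so the carry propagates across
 * the full 128 bits, and byte-swaps both 64-bit lanes back to big
 * endian with "rev64 vctr.16b".  The .Lctr_end epilogue mirrors the
 * prologue to store the updated counter back through x3.
 */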