chacha-neon-core.S
/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)
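
/*
 * For reference only, not part of the original source or the build: the loop
 * above is a NEON rendering of the standard ChaCha double round sketched in
 * plain C below.  The QR macro and the x[] naming are illustrative; with the
 * state held as rows in v0-v3, the four column quarter-rounds run in parallel
 * (one per 32-bit lane), and the ext instructions rotate rows 1-3 so that the
 * diagonal quarter-rounds line up as columns as well.
 *
 *	#define QR(a, b, c, d) do {			\
 *		a += b; d = rol32(d ^ a, 16);		\
 *		c += d; b = rol32(b ^ c, 12);		\
 *		a += b; d = rol32(d ^ a, 8);		\
 *		c += d; b = rol32(b ^ c, 7);		\
 *	} while (0)
 *
 *	// column round
 *	QR(x[0], x[4], x[ 8], x[12]);
 *	QR(x[1], x[5], x[ 9], x[13]);
 *	QR(x[2], x[6], x[10], x[14]);
 *	QR(x[3], x[7], x[11], x[15]);
 *	// diagonal round
 *	QR(x[0], x[5], x[10], x[15]);
 *	QR(x[1], x[6], x[11], x[12]);
 *	QR(x[2], x[7], x[ 8], x[13]);
 *	QR(x[3], x[4], x[ 9], x[14]);
 */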

SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)

SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)
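
/*
 * For reference only, not part of the original source or the build: a plain C
 * sketch of what the two entry points above compute.  The identifiers
 * (chacha_doubleround, the unaligned helpers) are illustrative, and the byte
 * order shown is the little-endian serialization that ChaCha specifies.
 *
 *	void chacha_block_xor(const u32 s[16], u8 *out, const u8 *in,
 *			      int nrounds)
 *	{
 *		u32 x[16];
 *		int i;
 *
 *		memcpy(x, s, 64);
 *		for (i = 0; i < nrounds; i += 2)
 *			chacha_doubleround(x);	// column + diagonal round above
 *		for (i = 0; i < 16; i++)	// out = in ^ (x + s)
 *			put_unaligned_le32(get_unaligned_le32(in + 4 * i) ^
 *					   (x[i] + s[i]), out + 4 * i);
 *	}
 *
 *	void hchacha_block(const u32 s[16], u32 out[8], int nrounds)
 *	{
 *		u32 x[16];
 *		int i;
 *
 *		memcpy(x, s, 64);
 *		for (i = 0; i < nrounds; i += 2)
 *			chacha_doubleround(x);
 *		memcpy(out, x, 16);		// words 0-3
 *		memcpy(out + 4, x + 12, 16);	// words 12-15, no final addition
 *	}
 */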

	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, and
	// hence requires no word shuffling. For the final XORing step, we
	// transpose the matrix by interleaving 32- and then 64-bit words,
	// which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

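	//
	// For reference only, not part of the original source or the build:
	// the resulting layout, in rough C terms (v[n][b] standing for lane b
	// of register vn):
	//
	//	u32 v[16][4], a[16];
	//
	//	for (n = 0; n < 16; n++)
	//		for (b = 0; b < 4; b++)
	//			v[n][b] = s[n];		// ld4r broadcasts
	//	memcpy(a, s, 64);			// fifth, scalar block
	//	for (b = 0; b < 4; b++)
	//		v[12][b] += b + 1;		// CTRINC = { 1, 2, 3, 4 }
	//
	// so the NEON lanes hold the blocks with counters s[12]+1 .. s[12]+4,
	// while the scalar block keeps counter s[12] and ends up as the first
	// 64 bytes of output.
	//
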
.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4
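
	//
	// For reference only, not part of the original source or the build:
	// the code below finalizes all five blocks.  Continuing the
	// v[n][b]/a[n] sketch from above:
	//
	//	for (n = 0; n < 16; n++) {
	//		a[n] += s[n];				// counter +0
	//		for (b = 0; b < 4; b++)
	//			v[n][b] += s[n] + (n == 12 ? b + 1 : 0);
	//	}
	//
	// The four NEON blocks then have to be turned from one-word-per-
	// register form back into four consecutive 64-byte blocks before they
	// can be XORed with the input: zip1/zip2 on 32-bit words interleaves
	// state n with state n+1, and a second pass on 64-bit words interleaves
	// those results with state n+2, i.e. a 4x4 transpose done in two steps.
	//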

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	mov		w6, v16.s[0]
	mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	mov		w8, v18.s[0]
	mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	add		a0, a0, w6
	add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	add		a2, a2, w8
	add		a3, a3, w9
CPU_BE(	  rev		a0, a0		)
CPU_BE(	  rev		a1, a1		)
CPU_BE(	  rev		a2, a2		)
CPU_BE(	  rev		a3, a3		)

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	mov		w6, v20.s[0]
	mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	mov		w8, v22.s[0]
	mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	add		a4, a4, w6
	add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	add		a6, a6, w8
	add		a7, a7, w9
CPU_BE(	  rev		a4, a4		)
CPU_BE(	  rev		a5, a5		)
CPU_BE(	  rev		a6, a6		)
CPU_BE(	  rev		a7, a7		)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	mov		w6, v24.s[0]
	mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	mov		w8, v26.s[0]
	mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	add		a8, a8, w6
	add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	add		a10, a10, w8
	add		a11, a11, w9
CPU_BE(	  rev		a8, a8		)
CPU_BE(	  rev		a9, a9		)
CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	mov		w6, v28.s[0]
	mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	mov		w8, v30.s[0]
	mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	add		a12, a12, w6
	add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	add		a14, a14, w8
	add		a15, a15, w9
CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	ldp		w8, w9, [x2, #-56]
	eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	eor		a3, a3, w9
	ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	ldp		w8, w9, [x2, #-40]
	eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	eor		a7, a7, w9
	ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	ldp		w8, w9, [x2, #-24]
	eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	eor		a11, a11, w9
	ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	ldp		w8, w9, [x2, #-8]
	eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	eor		a15, a15, w9

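	//
	// For reference only, not part of the original source or the build:
	// everything from here on also copes with byte counts below the full
	// 5 x 64 = 320 bytes.  x3 is pointed at the last 64 bytes of the input
	// (x2 has already moved past the first block), and before each further
	// 64-byte load the remaining byte count is tested: if that block is
	// not fully present, csel redirects the load to x3 so it reads the
	// real tail of the buffer instead of running past it.  The tail
	// handlers below shift the relevant keystream block through the
	// .Lpermute window with tbl, XOR it into those tail bytes, and write
	// them with a 64-byte store that ends exactly at the end of the
	// output; the preceding full block is stored afterwards, overwriting
	// the overlapped bytes with correct data.
	//
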
	add		x3, x2, x4
	sub		x3, x3, #128		// start of last block

	subs		x5, x4, #128
	csel		x2, x2, x3, ge

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	stp		a2, a3, [x1, #-56]

	subs		x6, x4, #192
	ld1		{v16.16b-v19.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	stp		a6, a7, [x1, #-40]

	subs		x7, x4, #256
	ld1		{v20.16b-v23.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	stp		a10, a11, [x1, #-24]

	subs		x8, x4, #320
	ld1		{v24.16b-v27.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	stp		a14, a15, [x1, #-8]

	tbnz		x5, #63, .Lt128
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b

	tbnz		x6, #63, .Lt192

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b

	st1		{v16.16b-v19.16b}, [x1], #64
	tbnz		x7, #63, .Lt256

	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b

	st1		{v20.16b-v23.16b}, [x1], #64
	tbnz		x8, #63, .Lt320

	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b

	st1		{v24.16b-v27.16b}, [x1], #64
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

	// fewer than 192 bytes of in/output
.Lt192:	cbz		x5, 1f			// exactly 128 bytes?
	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b

0:	eor		v20.16b, v20.16b, v28.16b
	eor		v21.16b, v21.16b, v29.16b
	eor		v22.16b, v22.16b, v30.16b
	eor		v23.16b, v23.16b, v31.16b
	st1		{v20.16b-v23.16b}, [x5]	// overlapping stores
1:	st1		{v16.16b-v19.16b}, [x1]
	b		.Lout

	// fewer than 128 bytes of in/output
.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	sub		x1, x1, #64
	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
	ld1		{v16.16b-v19.16b}, [x1]	// reload first output block
	b		0b

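	// For reference only, not part of the original source or the build:
	// a worked example of the handlers above for a 150-byte request.
	// x5 = 22, so x10 points at .Lpermute + 22 and the tbl lookups in
	// .Lt192 turn the keystream in v4-v7 into 42 zero bytes followed by
	// its first 22 bytes.  v20-v23 hold the last 64 input bytes (that load
	// was redirected to x3), so the eor/st1 at label 0 writes output bytes
	// 86-149, of which only the final 22 are newly encrypted, ending
	// exactly at the 150-byte mark.  The st1 at label 1 then rewrites
	// bytes 64-127 with the full second block, fixing up the overlap.
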
	// fewer than 256 bytes of in/output
.Lt256:	cbz		x6, 2f			// exactly 192 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x6, x6, x1
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x6]	// overlapping stores
2:	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
.Lt320:	cbz		x7, 3f			// exactly 256 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x7, x7, x1
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x7]	// overlapping stores
3:	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout
SYM_FUNC_END(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
.Lpermute:
	.set		.Li, 0
	.rept		128
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
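
/*
 * For reference only, not part of the original source or the build: .Lpermute
 * is a 128-byte window of byte indices running from -64 to 63.  Loading 64
 * bytes at offset (len & 63) and using them as a tbl index vector against a
 * 64-byte keystream block yields that block shifted so that its first
 * (len & 63) bytes line up with the end of a 64-byte overlapping store
 * (out-of-range indices make tbl return zero).  CTRINC holds the per-lane
 * block counter increments for the four NEON blocks, and ROT8 is the byte
 * shuffle mask that lets tbl implement rotl32(x, 8) on each 32-bit lane.
 */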