chacha-scalar-core.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */

        // ChaCha state registers
        X0      .req    r0
        X1      .req    r1
        X2      .req    r2
        X3      .req    r3
        X4      .req    r4
        X5      .req    r5
        X6      .req    r6
        X7      .req    r7
        X8_X10  .req    r8      // shared by x8 and x10
        X9_X11  .req    r9      // shared by x9 and x11
        X12     .req    r10
        X13     .req    r11
        X14     .req    r12
        X15     .req    r14

.macro _le32_bswap_4x   a, b, c, d,  tmp
#ifdef __ARMEB__
        rev_l           \a,  \tmp
        rev_l           \b,  \tmp
        rev_l           \c,  \tmp
        rev_l           \d,  \tmp
#endif
.endm

.macro __ldrd           a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
        ldrd            \a, \b, [\src, #\offset]
#else
        ldr             \a, [\src, #\offset]
        ldr             \b, [\src, #\offset + 4]
#endif
.endm

.macro __strd           a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
        strd            \a, \b, [\dst, #\offset]
#else
        str             \a, [\dst, #\offset]
        str             \b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround       a1, b1, c1, d1,  a2, b2, c2, d2

        // a += b; d ^= a; d = rol(d, 16);
        add             \a1, \a1, \b1, ror #brot
        add             \a2, \a2, \b2, ror #brot
        eor             \d1, \a1, \d1, ror #drot
        eor             \d2, \a2, \d2, ror #drot
        // drot == 32 - 16 == 16

        // c += d; b ^= c; b = rol(b, 12);
        add             \c1, \c1, \d1, ror #16
        add             \c2, \c2, \d2, ror #16
        eor             \b1, \c1, \b1, ror #brot
        eor             \b2, \c2, \b2, ror #brot
        // brot == 32 - 12 == 20

        // a += b; d ^= a; d = rol(d, 8);
        add             \a1, \a1, \b1, ror #20
        add             \a2, \a2, \b2, ror #20
        eor             \d1, \a1, \d1, ror #16
        eor             \d2, \a2, \d2, ror #16
        // drot == 32 - 8 == 24

        // c += d; b ^= c; b = rol(b, 7);
        add             \c1, \c1, \d1, ror #24
        add             \c2, \c2, \d2, ror #24
        eor             \b1, \c1, \b1, ror #20
        eor             \b2, \c2, \b2, ror #20
        // brot == 32 - 7 == 25
.endm
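
/*
 * To illustrate the deferred rotations described in the design notes: at the
 * start of a quarterround, the register holding a row-'b' word contains
 * rol(b, brot) and the register holding a row-'d' word contains rol(d, drot).
 * The first step, "a += b; d ^= a; d = rol(d, 16);", is emitted by _halfround
 * above as
 *
 *      add     \a1, \a1, \b1, ror #brot   // 'ror #brot' recovers the true b
 *      eor     \d1, \a1, \d1, ror #drot   // \d1 = a ^ d; the rol by 16 is
 *                                         // left pending
 *
 * so afterwards \d1 holds the new d rotated by 16 bits, i.e. drot == 16
 * (matching the "drot == 32 - 16 == 16" note above).  The fixup is folded into
 * the 'ror #16' operand of the next instruction that reads \d1, so no separate
 * rotate instruction is ever needed.
 */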

.macro _doubleround

        // column round

        // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
        _halfround      X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

        // save (x8, x9); restore (x10, x11)
        __strd          X8_X10, X9_X11, sp, 0
        __ldrd          X8_X10, X9_X11, sp, 8

        // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
        _halfround      X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

        .set brot, 25
        .set drot, 24

        // diagonal round

        // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
        _halfround      X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

        // save (x10, x11); restore (x8, x9)
        __strd          X8_X10, X9_X11, sp, 8
        __ldrd          X8_X10, X9_X11, sp, 0

        // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
        _halfround      X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute  nrounds
        .set brot, 0
        .set drot, 0
        .rept \nrounds / 2
        _doubleround
        .endr
.endm

.macro _chacha          nrounds

.Lnext_block\@:
        // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
        // Registers contain x0-x9,x12-x15.

        // Do the core ChaCha permutation to update x0-x15.
        _chacha_permute \nrounds

        add             sp, #8
        // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers contain x0-x9,x12-x15.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
        push            {X8_X10, X9_X11, X12, X13, X14, X15}

        // Load (OUT, IN, LEN).
        ldr             r14, [sp, #96]
        ldr             r12, [sp, #100]
        ldr             r11, [sp, #104]

        orr             r10, r14, r12

        // Use slow path if fewer than 64 bytes remain.
        cmp             r11, #64
        blt             .Lxor_slowpath\@

        // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
        // ARMv6+, since ldmia and stmia (used below) still require alignment.
        tst             r10, #3
        bne             .Lxor_slowpath\@

        // Fast path: XOR 64 bytes of aligned data.

        // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // x0-x3
        __ldrd          r8, r9, sp, 32
        __ldrd          r10, r11, sp, 40
        add             X0, X0, r8
        add             X1, X1, r9
        add             X2, X2, r10
        add             X3, X3, r11
        _le32_bswap_4x  X0, X1, X2, X3,  r8
        ldmia           r12!, {r8-r11}
        eor             X0, X0, r8
        eor             X1, X1, r9
        eor             X2, X2, r10
        eor             X3, X3, r11
        stmia           r14!, {X0-X3}

        // x4-x7
        __ldrd          r8, r9, sp, 48
        __ldrd          r10, r11, sp, 56
        add             X4, r8, X4, ror #brot
        add             X5, r9, X5, ror #brot
        ldmia           r12!, {X0-X3}
        add             X6, r10, X6, ror #brot
        add             X7, r11, X7, ror #brot
        _le32_bswap_4x  X4, X5, X6, X7,  r8
        eor             X4, X4, X0
        eor             X5, X5, X1
        eor             X6, X6, X2
        eor             X7, X7, X3
        stmia           r14!, {X4-X7}

        // x8-x15
        pop             {r0-r7}                 // (x8-x9,x12-x15,x10-x11)
        __ldrd          r8, r9, sp, 32
        __ldrd          r10, r11, sp, 40
        add             r0, r0, r8              // x8
        add             r1, r1, r9              // x9
        add             r6, r6, r10             // x10
        add             r7, r7, r11             // x11
        _le32_bswap_4x  r0, r1, r6, r7,  r8
        ldmia           r12!, {r8-r11}
        eor             r0, r0, r8              // x8
        eor             r1, r1, r9              // x9
        eor             r6, r6, r10             // x10
        eor             r7, r7, r11             // x11
        stmia           r14!, {r0,r1,r6,r7}
        ldmia           r12!, {r0,r1,r6,r7}
        __ldrd          r8, r9, sp, 48
        __ldrd          r10, r11, sp, 56
        add             r2, r8, r2, ror #drot   // x12
        add             r3, r9, r3, ror #drot   // x13
        add             r4, r10, r4, ror #drot  // x14
        add             r5, r11, r5, ror #drot  // x15
        _le32_bswap_4x  r2, r3, r4, r5,  r9
        ldr             r9, [sp, #72]           // load LEN
        eor             r2, r2, r0              // x12
        eor             r3, r3, r1              // x13
        eor             r4, r4, r6              // x14
        eor             r5, r5, r7              // x15
        subs            r9, #64                 // decrement and check LEN
        stmia           r14!, {r2-r5}

        beq             .Ldone\@

.Lprepare_for_next_block\@:

        // Stack: x0-x15 OUT IN LEN
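        // Registers (whether we got here from the fast path above or the slow
        // path below): r8 is x12, the block counter, not yet incremented;
        // r12 is IN and r14 is OUT, both already advanced past this block;
        // r9 is LEN, already decremented by 64.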

        // Increment block counter (x12)
        add             r8, #1

        // Store updated (OUT, IN, LEN)
        str             r14, [sp, #64]
        str             r12, [sp, #68]
        str             r9, [sp, #72]

        mov             r14, sp

        // Store updated block counter (x12)
        str             r8, [sp, #48]

        sub             sp, #16

        // Reload state and do next block
        ldmia           r14!, {r0-r11}          // load x0-x11
        __strd          r10, r11, sp, 8         // store x10-x11 before state
        ldmia           r14, {r10-r12,r14}      // load x12-x15
        b               .Lnext_block\@

.Lxor_slowpath\@:
        // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
        // We handle it by storing the 64 bytes of keystream to the stack, then
        // XOR-ing the needed portion with the data.

        // Allocate keystream buffer
        sub             sp, #64
        mov             r14, sp

        // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // Save keystream for x0-x3
        __ldrd          r8, r9, sp, 96
        __ldrd          r10, r11, sp, 104
        add             X0, X0, r8
        add             X1, X1, r9
        add             X2, X2, r10
        add             X3, X3, r11
        _le32_bswap_4x  X0, X1, X2, X3,  r8
        stmia           r14!, {X0-X3}

        // Save keystream for x4-x7
        __ldrd          r8, r9, sp, 112
        __ldrd          r10, r11, sp, 120
        add             X4, r8, X4, ror #brot
        add             X5, r9, X5, ror #brot
        add             X6, r10, X6, ror #brot
        add             X7, r11, X7, ror #brot
        _le32_bswap_4x  X4, X5, X6, X7,  r8
        add             r8, sp, #64
        stmia           r14!, {X4-X7}

        // Save keystream for x8-x15
        ldm             r8, {r0-r7}             // (x8-x9,x12-x15,x10-x11)
        __ldrd          r8, r9, sp, 128
        __ldrd          r10, r11, sp, 136
        add             r0, r0, r8              // x8
        add             r1, r1, r9              // x9
        add             r6, r6, r10             // x10
        add             r7, r7, r11             // x11
        _le32_bswap_4x  r0, r1, r6, r7,  r8
        stmia           r14!, {r0,r1,r6,r7}
        __ldrd          r8, r9, sp, 144
        __ldrd          r10, r11, sp, 152
        add             r2, r8, r2, ror #drot   // x12
        add             r3, r9, r3, ror #drot   // x13
        add             r4, r10, r4, ror #drot  // x14
        add             r5, r11, r5, ror #drot  // x15
        _le32_bswap_4x  r2, r3, r4, r5,  r9
        stmia           r14, {r2-r5}

        // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
        // Registers: r8 is block counter, r12 is IN.

        ldr             r9, [sp, #168]          // LEN
        ldr             r14, [sp, #160]         // OUT
        cmp             r9, #64
        mov             r0, sp
        movle           r1, r9
        movgt           r1, #64
        // r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
        orr             r2, r12, r14
        tst             r2, #3                  // IN or OUT misaligned?
        bne             .Lxor_next_byte\@
.endif

        // XOR a word at a time
.rept 16
        subs            r1, #4
        blt             .Lxor_words_done\@
        ldr             r2, [r12], #4
        ldr             r3, [r0], #4
        eor             r2, r2, r3
        str             r2, [r14], #4
.endr
        b               .Lxor_slowpath_done\@
.Lxor_words_done\@:
        ands            r1, r1, #3
        beq             .Lxor_slowpath_done\@

        // XOR a byte at a time
.Lxor_next_byte\@:
        ldrb            r2, [r12], #1
        ldrb            r3, [r0], #1
        eor             r2, r2, r3
        strb            r2, [r14], #1
        subs            r1, #1
        bne             .Lxor_next_byte\@

.Lxor_slowpath_done\@:
        subs            r9, #64
        add             sp, #96
        bgt             .Lprepare_for_next_block\@

.Ldone\@:
.endm   // _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 */
ENTRY(chacha_doarm)
        cmp             r2, #0                  // len == 0?
        reteq           lr
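
        // The fifth argument, 'nrounds', is passed on the stack per the AAPCS;
        // test it before building the stack frame.  None of the instructions
        // below modify the flags, so the result of 'cmp ip, #12' is still
        // valid at the 'beq 1f' that selects the ChaCha12 path.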
        ldr             ip, [sp]
        cmp             ip, #12

        push            {r0-r2,r4-r11,lr}

        // Push state x0-x15 onto stack.
        // Also store an extra copy of x10-x11 just before the state.

        add             X12, r3, #48
        ldm             X12, {X12,X13,X14,X15}
        push            {X12,X13,X14,X15}
        sub             sp, sp, #64

        __ldrd          X8_X10, X9_X11, r3, 40
        __strd          X8_X10, X9_X11, sp, 8
        __strd          X8_X10, X9_X11, sp, 56
        ldm             r3, {X0-X9_X11}
        __strd          X0, X1, sp, 16
        __strd          X2, X3, sp, 24
        __strd          X4, X5, sp, 32
        __strd          X6, X7, sp, 40
        __strd          X8_X10, X9_X11, sp, 48

        beq             1f
        _chacha         20

0:      add             sp, #76
        pop             {r4-r11, pc}

1:      _chacha         12
        b               0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
ENTRY(hchacha_block_arm)
        push            {r1,r4-r11,lr}

        cmp             r2, #12                 // ChaCha12 ?

        mov             r14, r0
        ldmia           r14!, {r0-r11}          // load x0-x11
        push            {r10-r11}               // store x10-x11 to stack
        ldm             r14, {r10-r12,r14}      // load x12-x15
        sub             sp, #8

        beq             1f
        _chacha_permute 20

        // Skip over (unused0-unused1, x10-x11)
0:      add             sp, #16

        // Fix up rotations of x12-x15
        ror             X12, X12, #drot
        ror             X13, X13, #drot
        pop             {r4}                    // load 'out'
        ror             X14, X14, #drot
        ror             X15, X15, #drot

        // Store (x0-x3,x12-x15) to 'out'
        stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}

        pop             {r4-r11,pc}

1:      _chacha_permute 12
        b               0b
ENDPROC(hchacha_block_arm)
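
/*
 * Usage sketch (illustrative only; 'dst', 'src', 'nbytes' and 'out' are just
 * placeholder names, and the layout is the standard ChaCha state):
 *
 *	u32 state[16];	// words 0-3:  "expand 32-byte k" constants
 *			// words 4-11: key
 *			// word 12:    block counter
 *			// words 13-15: nonce
 *
 *	chacha_doarm(dst, src, nbytes, state, 20);   // or nrounds == 12
 *	hchacha_block_arm(state, out, 20);           // out[0..7] = words 0-3
 *						     // and 12-15 of the
 *						     // permuted state
 *
 * chacha_doarm() advances the block counter internally for each 64-byte block
 * it processes, but never stores through the 'state' pointer, so the caller's
 * copy of the state is left unmodified.
 */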