sha1_ssse3_asm.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a window of pre-calculated w[i]+K values in a 16-entry (64-byte)
 * circular buffer on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16

/*
 * This macro generates a SHA-1 transform function that processes its input
 * one 64-byte block at a time
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm
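
/*
 * Workspace note: the 64 bytes reserved above form the WK(t) ring buffer,
 * i.e. 16 dwords of pre-computed w[i]+K values.  WK(t) indexes it as
 * ((t) & 15) * 4 off %rsp, so each slot is reused every 16 rounds, e.g.:
 *
 *     WK(1)  = (( 1 & 15) * 4)(%rsp) = 4(%rsp)
 *     WK(17) = ((17 & 15) * 4)(%rsp) = 4(%rsp)
 *
 * The "mov $8, %ecx; rep stosq" cleanup therefore wipes exactly these
 * 8 quadwords (64 bytes) before the stack frame is torn down.
 */
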
/*
 * This macro implements the 80 SHA-1 rounds and loops over the input,
 * one 64-byte block per iteration
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept   W_PRECALC_AHEAD
	W_PRECALC i
  .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current block is the last one,
	cmovae	K_BASE, BUFFER_PTR	# use a dummy source to avoid a buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4  b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 * (<<= and >>= denote rotates in this pseudocode):
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7 => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W, W_minus_32
.endm
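
/*
 * Register renaming note: W, W_minus_04, ..., W_minus_32 are assembler-time
 * aliases for the eight XMM registers W0..W28, so the 32 most recent w[]
 * values always stay in registers without being copied around.  After
 * W_PRECALC_RESET the aliases line up as
 *
 *     W = W0, W_minus_04 = W4, ..., W_minus_28 = W28
 *
 * (W_minus_32 shares W's register), and each W_PRECALC_ROTATE shifts every
 * alias down by one slot: the register that held W becomes W_minus_04, and
 * W takes over the register of the no-longer-needed W_minus_28 values.
 */
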

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculate the last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store them to memory, for later load by
 *   an ALU add instruction
 *
 * vectorization for rounds 16-31 takes some "heavy lifting" because of the
 * w[i] -> w[i-3] dependency, but it improves for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * (see the derivation note after this macro block), which allows more
 * efficient vectorization since the w[i] -> w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3
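
/*
 * Derivation of the rounds 32-79 recurrence used above: start from the
 * standard w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 and expand
 * each term once more (valid once i >= 32):
 *
 *     w[i-3]                     = (w[i-6]  ^ w[i-11] ^ w[i-17] ^ w[i-19]) rol 1
 *     w[i-8] ^ w[i-14] ^ w[i-16] = (w[i-11] ^ w[i-16] ^ w[i-17] ^ w[i-19]
 *                                   ^ w[i-28] ^ w[i-32]) rol 1
 *
 * (w[i-22], w[i-24] and w[i-30] each occur twice in the second line and
 * cancel).  XORing the two lines cancels w[i-11], w[i-17] and w[i-19] as
 * well, and because rotation distributes over xor this gives
 *
 *     w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2,   for i >= 32
 *
 * No term is closer than six positions back, so w[i], w[i+1], w[i+2] and
 * w[i+3] can be computed together in one XMM register with no intra-vector
 * dependency.
 */
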

#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM	sha1_transform_ssse3

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *
 * extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				      const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM	sha1_transform_avx
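
/*
 * Example of how these transforms are driven from C (a minimal sketch under
 * assumptions, not code from this file): the helper name example_hash_blocks
 * and the exact header paths are illustrative only.  The asm consumes whole
 * 64-byte blocks, so any partial block must be buffered by the caller, and
 * in kernel context the SIMD registers may only be touched between
 * kernel_fpu_begin() and kernel_fpu_end().
 *
 *	#include <linux/types.h>
 *	#include <asm/fpu/api.h>	// kernel_fpu_begin()/kernel_fpu_end()
 *
 *	struct sha1_state;		// assumed to begin with u32 state[5]
 *
 *	extern void sha1_transform_ssse3(struct sha1_state *state,
 *					 const u8 *data, int blocks);
 *
 *	// Hypothetical helper: hash all complete 64-byte blocks in 'data'.
 *	static void example_hash_blocks(struct sha1_state *sctx,
 *					const u8 *data, unsigned int len)
 *	{
 *		int blocks = len / 64;	// full 64-byte blocks only
 *
 *		if (!blocks)
 *			return;
 *
 *		kernel_fpu_begin();	// XMM state will be clobbered
 *		sha1_transform_ssse3(sctx, data, blocks);
 *		kernel_fpu_end();
 *	}
 */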