sha256-avx-asm.S (17476B)
########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
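#
# The round code below computes the standard SHA-256 primitives
# (FIPS 180-4 notation; "ror" denotes a 32-bit rotate right):
#   s0(x)      = (x ror  7) ^ (x ror 18) ^ (x >>  3)
#   s1(x)      = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#   S0(a)      = (a ror  2) ^ (a ror 13) ^ (a ror 22)
#   S1(e)      = (e ror  6) ^ (e ror 11) ^ (e ror 25)
#   CH(e,f,g)  = (e & f) ^ (~e & g)          = ((f ^ g) & e) ^ g
#   MAJ(a,b,c) = (a & b) ^ (a & c) ^ (b & c) = ((a | c) & b) | (a & c)
#   W[t]       = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16],  16 <= t < 64
########################################################################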

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add \p1, \p2
	mov \p2, \p1
.endm

# MY_ROR amt, reg
# Rotate a 32-bit register right by amt bits: shld of a register with
# itself by (32-amt) is a rotate left by (32-amt), i.e. a rotate right by amt
.macro MY_ROR p1 p2
	shld $(32-(\p1)), \p2, \p2
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER  = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10	# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12	# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx		# 3rd arg
INP = %rsi		# 2nd arg
CTX = %rdi		# 1st arg

SRND = %rsi		# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

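# FOUR_ROUNDS_AND_SCHED
# Perform four SHA-256 rounds using the four K[t]+W[t] words staged at
# _XFER(%rsp), interleaved with computing the next four message-schedule
# words; the new words are built in X0, which the trailing rotate_Xs
# then renames to X3 (the newest schedule block).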
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov e, y0	# y0 = e
	MY_ROR (25-11), y0	# y0 = e >> (25-11)
	mov a, y1	# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR (22-13), y1	# y1 = a >> (22-13)
	xor e, y0	# y0 = e ^ (e >> (25-11))
	mov f, y2	# y2 = f
	MY_ROR (11-6), y0	# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor a, y1	# y1 = a ^ (a >> (22-13))
	xor g, y2	# y2 = f^g
	vpaddd X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor e, y0	# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2	# y2 = (f^g)&e
	MY_ROR (13-2), y1	# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor a, y1	# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR 6, y0	# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2	# y2 = CH = ((f^g)&e)^g
	MY_ROR 2, y1	# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y0, y2	# y2 = S1 + CH
	add _XFER(%rsp), y2	# y2 = k + w + S1 + CH
	mov a, y0	# y0 = a
	add y2, h	# h = h + S1 + CH + k + w
	mov a, y2	# y2 = a
	vpsrld $7, XTMP1, XTMP2
	or c, y0	# y0 = a|c
	add h, d	# d = d + h + S1 + CH + k + w
	and c, y2	# y2 = a&c
	vpslld $(32-7), XTMP1, XTMP3
	and b, y0	# y0 = (a|c)&b
	add y1, h	# h = h + S1 + CH + k + w + S0
	vpor XTMP2, XTMP3, XTMP3	# XTMP1 = W[-15] MY_ROR 7
	or y2, y0	# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h	# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov e, y0	# y0 = e
	mov a, y1	# y1 = a
	MY_ROR (25-11), y0	# y0 = e >> (25-11)
	xor e, y0	# y0 = e ^ (e >> (25-11))
	mov f, y2	# y2 = f
	MY_ROR (22-13), y1	# y1 = a >> (22-13)
	vpsrld $18, XTMP1, XTMP2
	xor a, y1	# y1 = a ^ (a >> (22-13))
	MY_ROR (11-6), y0	# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor g, y2	# y2 = f^g
	vpsrld $3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR (13-2), y1	# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor e, y0	# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2	# y2 = (f^g)&e
	MY_ROR 6, y0	# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld $(32-18), XTMP1, XTMP1
	xor a, y1	# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor g, y2	# y2 = CH = ((f^g)&e)^g
	vpxor XTMP1, XTMP3, XTMP3
	add y0, y2	# y2 = S1 + CH
	add (1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR 2, y1	# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor XTMP2, XTMP3, XTMP3	# XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov a, y0	# y0 = a
	add y2, h	# h = h + S1 + CH + k + w
	mov a, y2	# y2 = a
	vpxor XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or c, y0	# y0 = a|c
	add h, d	# d = d + h + S1 + CH + k + w
	and c, y2	# y2 = a&c
	## compute low s1
	vpshufd $0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and b, y0	# y0 = (a|c)&b
	add y1, h	# h = h + S1 + CH + k + w + S0
	vpaddd XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or y2, y0	# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h	# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov e, y0	# y0 = e
	mov a, y1	# y1 = a
	MY_ROR (25-11), y0	# y0 = e >> (25-11)
	xor e, y0	# y0 = e ^ (e >> (25-11))
	MY_ROR (22-13), y1	# y1 = a >> (22-13)
	mov f, y2	# y2 = f
	xor a, y1	# y1 = a ^ (a >> (22-13))
	MY_ROR (11-6), y0	# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld $10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor g, y2	# y2 = f^g
	vpsrlq $19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor e, y0	# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2	# y2 = (f^g)&e
	vpsrlq $17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR (13-2), y1	# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor a, y1	# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor g, y2	# y2 = CH = ((f^g)&e)^g
	MY_ROR 6, y0	# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor XTMP3, XTMP2, XTMP2
	add y0, y2	# y2 = S1 + CH
	MY_ROR 2, y1	# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add (2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov a, y0	# y0 = a
	add y2, h	# h = h + S1 + CH + k + w
	mov a, y2	# y2 = a
	vpshufb SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or c, y0	# y0 = a|c
	add h, d	# d = d + h + S1 + CH + k + w
	and c, y2	# y2 = a&c
	vpaddd XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and b, y0	# y0 = (a|c)&b
	add y1, h	# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd $0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
	or y2, y0	# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h	# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov e, y0	# y0 = e
	MY_ROR (25-11), y0	# y0 = e >> (25-11)
	mov a, y1	# y1 = a
	MY_ROR (22-13), y1	# y1 = a >> (22-13)
	xor e, y0	# y0 = e ^ (e >> (25-11))
	mov f, y2	# y2 = f
	MY_ROR (11-6), y0	# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld $10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor a, y1	# y1 = a ^ (a >> (22-13))
	xor g, y2	# y2 = f^g
	vpsrlq $19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor e, y0	# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2	# y2 = (f^g)&e
	MY_ROR (13-2), y1	# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq $17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor a, y1	# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR 6, y0	# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2	# y2 = CH = ((f^g)&e)^g
	vpxor XTMP3, XTMP2, XTMP2
	MY_ROR 2, y1	# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y0, y2	# y2 = S1 + CH
	add (3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov a, y0	# y0 = a
	add y2, h	# h = h + S1 + CH + k + w
	mov a, y2	# y2 = a
	vpshufb SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or c, y0	# y0 = a|c
	add h, d	# d = d + h + S1 + CH + k + w
	and c, y2	# y2 = a&c
	vpaddd XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and b, y0	# y0 = (a|c)&b
	add y1, h	# h = h + S1 + CH + k + w + S0
	or y2, y0	# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h	# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

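# DO_ROUND round
# One SHA-256 round with no message scheduling; used for the final 16
# rounds, where W[48..63] are already available in X0..X3.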
## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov e, y0	# y0 = e
	MY_ROR (25-11), y0	# y0 = e >> (25-11)
	mov a, y1	# y1 = a
	xor e, y0	# y0 = e ^ (e >> (25-11))
	MY_ROR (22-13), y1	# y1 = a >> (22-13)
	mov f, y2	# y2 = f
	xor a, y1	# y1 = a ^ (a >> (22-13))
	MY_ROR (11-6), y0	# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor g, y2	# y2 = f^g
	xor e, y0	# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR (13-2), y1	# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and e, y2	# y2 = (f^g)&e
	xor a, y1	# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR 6, y0	# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2	# y2 = CH = ((f^g)&e)^g
	add y0, y2	# y2 = S1 + CH
	MY_ROR 2, y1	# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov a, y0	# y0 = a
	add y2, h	# h = h + S1 + CH + k + w
	mov a, y2	# y2 = a
	or c, y0	# y0 = a|c
	add h, d	# d = d + h + S1 + CH + k + w
	and c, y2	# y2 = a&c
	and b, y0	# y0 = (a|c)&b
	add y1, h	# h = h + S1 + CH + k + w + S0
	or y2, y0	# y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h	# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_FUNC_START(sha256_transform_avx)
.align 32
	pushq %rbx
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushq %rbp
	movq %rsp, %rbp

	subq $STACK_SIZE, %rsp	# allocate stack space
	and $~15, %rsp	# align stack pointer

	shl $6, NUM_BLKS	# convert to bytes
	jz done_hash
	add INP, NUM_BLKS	# pointer to end of data
	mov NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov 4*0(CTX), a
	mov 4*1(CTX), b
	mov 4*2(CTX), c
	mov 4*3(CTX), d
	mov 4*4(CTX), e
	mov 4*5(CTX), f
	mov 4*6(CTX), g
	mov 4*7(CTX), h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00
loop0:
	lea K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK

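	## SRND reuses %rsi, so stash the input pointer on the stack first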
	mov INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov $3, SRND
.align 16
loop1:
	vpaddd (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd 1*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd 2*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd 3*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	add $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub $1, SRND
	jne loop1

	mov $2, SRND
loop2:
	vpaddd (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vpaddd 1*16(TBL), X1, XFER
	vmovdqa XFER, _XFER(%rsp)
	add $2*16, TBL
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vmovdqa X2, X0
	vmovdqa X3, X1

	sub $1, SRND
	jne loop2

	addm (4*0)(CTX), a
	addm (4*1)(CTX), b
	addm (4*2)(CTX), c
	addm (4*3)(CTX), d
	addm (4*4)(CTX), e
	addm (4*5)(CTX), f
	addm (4*6)(CTX), g
	addm (4*7)(CTX), h

	mov _INP(%rsp), INP
	add $64, INP
	cmp _INP_END(%rsp), INP
	jne loop0

done_hash:

	mov %rbp, %rsp
	popq %rbp
	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF