cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

sha256-avx-asm.S (17476B)


      1########################################################################
      2# Implement fast SHA-256 with AVX1 instructions. (x86_64)
      3#
      4# Copyright (C) 2013 Intel Corporation.
      5#
      6# Authors:
      7#     James Guilford <james.guilford@intel.com>
      8#     Kirk Yap <kirk.s.yap@intel.com>
      9#     Tim Chen <tim.c.chen@linux.intel.com>
     10#
     11# This software is available to you under a choice of one of two
     12# licenses.  You may choose to be licensed under the terms of the GNU
     13# General Public License (GPL) Version 2, available from the file
     14# COPYING in the main directory of this source tree, or the
     15# OpenIB.org BSD license below:
     16#
     17#     Redistribution and use in source and binary forms, with or
     18#     without modification, are permitted provided that the following
     19#     conditions are met:
     20#
     21#      - Redistributions of source code must retain the above
     22#        copyright notice, this list of conditions and the following
     23#        disclaimer.
     24#
     25#      - Redistributions in binary form must reproduce the above
     26#        copyright notice, this list of conditions and the following
     27#        disclaimer in the documentation and/or other materials
     28#        provided with the distribution.
     29#
     30# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     31# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     32# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     33# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     34# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     35# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     36# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     37# SOFTWARE.
     38########################################################################
     39#
     40# This code is described in an Intel White-Paper:
     41# "Fast SHA-256 Implementations on Intel Architecture Processors"
     42#
     43# To find it, surf to http://www.intel.com/p/en_US/embedded
     44# and search for that title.
     45#
     46########################################################################
     47# This code schedules 1 block at a time, with 4 lanes per block
     48########################################################################
     49
     50#include <linux/linkage.h>
     51
     52## assume buffers not aligned
     53#define    VMOVDQ vmovdqu
     54
     55################################ Define Macros
     56
      57# addm [mem], reg
      58# Add reg to mem using reg-mem add and store
      59.macro addm p1 p2
      60	add     \p1, \p2	# \p2 += \p1 (AT&T order: \p1 is the memory operand); clobbers flags
      61	mov     \p2, \p1	# store the sum back to memory
      62.endm
     63
     64
      65.macro MY_ROR p1 p2
	# MY_ROR n, reg: rotate 32-bit \p2 right by \p1 bits.
	# shld of a register with itself is a left-rotate; rotating left
	# by (32-n) equals rotating right by n.  shld is presumably chosen
	# over plain ror for scheduling on the target uarch — TODO confirm.
      66	shld    $(32-(\p1)), \p2, \p2
      67.endm
     68
     69################################
     70
      71# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
      72# Load xmm with mem and byte swap each dword
      73.macro COPY_XMM_AND_BSWAP p1 p2 p3
      74	VMOVDQ \p2, \p1	# unaligned load (VMOVDQ = vmovdqu; input buffer may be unaligned)
      75	vpshufb \p3, \p1, \p1	# byte-swap each dword via mask \p3 (message words are big-endian)
      76.endm
     77
     78################################
     79
      80X0 = %xmm4
      81X1 = %xmm5
      82X2 = %xmm6
      83X3 = %xmm7
      84
      85XTMP0 = %xmm0
      86XTMP1 = %xmm1
      87XTMP2 = %xmm2
      88XTMP3 = %xmm3
      89XTMP4 = %xmm8
      90XFER = %xmm9
      91XTMP5 = %xmm11
      92
      93SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
      94SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
      95BYTE_FLIP_MASK = %xmm13
      96
      97NUM_BLKS = %rdx   # 3rd arg
      98INP = %rsi        # 2nd arg
      99CTX = %rdi        # 1st arg
     100
     101SRND = %rsi       # clobbers INP (INP is saved to _INP(%rsp) before SRND is used)
     102c = %ecx
     103d = %r8d
     104e = %edx          # aliases NUM_BLKS (%rdx); NUM_BLKS is consumed before e is loaded
     105TBL = %r12
     106a = %eax
     107b = %ebx
     108
     109f = %r9d
     110g = %r10d
     111h = %r11d
     112
     113y0 = %r13d
     114y1 = %r14d
     115y2 = %r15d
     116
     117
# Stack frame layout (relative to the 16-byte-aligned %rsp):
#   _INP_END  (8 bytes): pointer to end of input data
#   _INP      (8 bytes): saved input pointer for the current block
#   _XFER    (16 bytes): k[i..i+3] + w[i..i+3] for the current 4 rounds
     118_INP_END_SIZE = 8
     119_INP_SIZE = 8
     120_XFER_SIZE = 16
     121_XMM_SAVE_SIZE = 0
     122
     123_INP_END = 0
     124_INP            = _INP_END  + _INP_END_SIZE
     125_XFER           = _INP      + _INP_SIZE
     126_XMM_SAVE       = _XFER     + _XFER_SIZE
     127STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
    128
     129# rotate_Xs
     130# Rotate values of symbols X0...X3
# (assemble-time symbol renaming only — emits no instructions)
     131.macro rotate_Xs
     132X_ = X0
     133X0 = X1
     134X1 = X2
     135X2 = X3
     136X3 = X_	# oldest schedule register becomes the slot for the newest words
     137.endm
    138
     139# ROTATE_ARGS
     140# Rotate values of symbols a...h
# Implements the per-round rotation of the eight working variables as
# assemble-time renaming (no register moves are emitted).
     141.macro ROTATE_ARGS
     142TMP_ = h
     143h = g
     144g = f
     145f = e
     146e = d
     147d = c
     148c = b
     149b = a
     150a = TMP_
     151.endm
    152
     153.macro FOUR_ROUNDS_AND_SCHED
     154	## compute s0 four at a time and s1 two at a time
     155	## compute W[-16] + W[-7] 4 at a time
	## Performs 4 SHA-256 rounds (scalar, interleaved with the vector
	## message-schedule computation of the next 4 words -> X0).
	## k+w for these rounds was pre-added into _XFER(%rsp) by the caller.
     156
     157	mov     e, y0			# y0 = e
     158	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
     159	mov     a, y1                   # y1 = a
     160	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
     161	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
     162	xor     e, y0                   # y0 = e ^ (e >> (25-11))
     163	mov     f, y2                   # y2 = f
     164	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
     165	xor     a, y1                   # y1 = a ^ (a >> (22-13))
     166	xor     g, y2                   # y2 = f^g
     167	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
     168	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
     169	and     e, y2                   # y2 = (f^g)&e
     170	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
     171	## compute s0
     172	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
     173	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
     174	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
     175	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
     176	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
     177	add     y0, y2                  # y2 = S1 + CH
     178	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
     179	mov     a, y0                   # y0 = a
     180	add     y2, h                   # h = h + S1 + CH + k + w
     181	mov     a, y2                   # y2 = a
     182	vpsrld  $7, XTMP1, XTMP2        # XTMP2 = W[-15] >> 7
     183	or      c, y0                   # y0 = a|c
     184	add     h, d                    # d = d + h + S1 + CH + k + w
     185	and     c, y2                   # y2 = a&c
     186	vpslld  $(32-7), XTMP1, XTMP3   # XTMP3 = W[-15] << (32-7)
     187	and     b, y0                   # y0 = (a|c)&b
     188	add     y1, h                   # h = h + S1 + CH + k + w + S0
     189	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7
     190	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
     191	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
     192	ROTATE_ARGS
     193	mov     e, y0                   # y0 = e
     194	mov     a, y1                   # y1 = a
     195	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
     196	xor     e, y0                   # y0 = e ^ (e >> (25-11))
     197	mov     f, y2                   # y2 = f
     198	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
     199	vpsrld  $18, XTMP1, XTMP2       # XTMP2 = W[-15] >> 18
     200	xor     a, y1                   # y1 = a ^ (a >> (22-13))
     201	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
     202	xor     g, y2                   # y2 = f^g
     203	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
     204	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
     205	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
     206	and     e, y2                   # y2 = (f^g)&e
     207	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
     208	vpslld  $(32-18), XTMP1, XTMP1  # XTMP1 = W[-15] << (32-18)
     209	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
     210	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
     211	vpxor   XTMP1, XTMP3, XTMP3     # XTMP3 = (W[-15] ror 7) ^ (W[-15] << (32-18))
     212	add     y0, y2                  # y2 = S1 + CH
     213	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
     214	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
     215	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = (W[-15] ror 7) ^ (W[-15] ror 18)
     216	mov     a, y0                   # y0 = a
     217	add     y2, h                   # h = h + S1 + CH + k + w
     218	mov     a, y2                   # y2 = a
     219	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
     220	or      c, y0                   # y0 = a|c
     221	add     h, d                    # d = d + h + S1 + CH + k + w
     222	and     c, y2                   # y2 = a&c
     223	## compute low s1
     224	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
     225	and     b, y0                   # y0 = (a|c)&b
     226	add     y1, h                   # h = h + S1 + CH + k + w + S0
     227	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
     228	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
     229	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
     230	ROTATE_ARGS
     231	mov     e, y0                   # y0 = e
     232	mov     a, y1                   # y1 = a
     233	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
     234	xor     e, y0                   # y0 = e ^ (e >> (25-11))
     235	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
     236	mov     f, y2                   # y2 = f
     237	xor     a, y1                   # y1 = a ^ (a >> (22-13))
     238	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
     239	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
     240	xor     g, y2                   # y2 = f^g
     241	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
     242	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
     243	and     e, y2                   # y2 = (f^g)&e
     244	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
     245	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
     246	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
     247	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
     248	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
     249	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = (W[-2] ror 19) ^ (W[-2] ror 17) {xBxA}
     250	add     y0, y2                  # y2 = S1 + CH
     251	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
     252	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
     253	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
     254	mov     a, y0                   # y0 = a
     255	add     y2, h                   # h = h + S1 + CH + k + w
     256	mov     a, y2                   # y2 = a
     257	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
     258	or      c, y0                   # y0 = a|c
     259	add     h, d                    # d = d + h + S1 + CH + k + w
     260	and     c, y2                   # y2 = a&c
     261	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
     262	and     b, y0                   # y0 = (a|c)&b
     263	add     y1, h                   # h = h + S1 + CH + k + w + S0
     264	## compute high s1
     265	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
     266	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
     267	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
     268	ROTATE_ARGS
     269	mov     e, y0                   # y0 = e
     270	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
     271	mov     a, y1                   # y1 = a
     272	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
     273	xor     e, y0                   # y0 = e ^ (e >> (25-11))
     274	mov     f, y2                   # y2 = f
     275	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
     276	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
     277	xor     a, y1                   # y1 = a ^ (a >> (22-13))
     278	xor     g, y2                   # y2 = f^g
     279	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
     280	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
     281	and     e, y2                   # y2 = (f^g)&e
     282	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
     283	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
     284	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
     285	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
     286	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
     287	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = (W[-2] ror 19) ^ (W[-2] ror 17) {xDxC}
     288	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
     289	add     y0, y2                  # y2 = S1 + CH
     290	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
     291	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
     292	mov     a, y0                   # y0 = a
     293	add     y2, h                   # h = h + S1 + CH + k + w
     294	mov     a, y2                   # y2 = a
     295	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
     296	or      c, y0                   # y0 = a|c
     297	add     h, d                    # d = d + h + S1 + CH + k + w
     298	and     c, y2                   # y2 = a&c
     299	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
     300	and     b, y0                   # y0 = (a|c)&b
     301	add     y1, h                   # h = h + S1 + CH + k + w + S0
     302	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
     303	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
     304	ROTATE_ARGS
     305	rotate_Xs
     306.endm
    307
     308## Single SHA-256 round; k+w input is [rsp + _XFER + \round * 4]
## Used for the final 16 rounds, where no further message scheduling
## is needed.  Working variables a..h are renamed via ROTATE_ARGS.
     309.macro DO_ROUND round
     310	mov	e, y0			# y0 = e
     311        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
     312        mov     a, y1                   # y1 = a
     313        xor     e, y0                   # y0 = e ^ (e >> (25-11))
     314        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
     315        mov     f, y2                   # y2 = f
     316        xor     a, y1                   # y1 = a ^ (a >> (22-13))
     317        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
     318        xor     g, y2                   # y2 = f^g
     319        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
     320        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
     321        and     e, y2                   # y2 = (f^g)&e
     322        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
     323        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
     324        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
     325        add     y0, y2                  # y2 = S1 + CH
     326        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
     327        offset = \round * 4 + _XFER     # byte offset of this round's k+w
     328        add     offset(%rsp), y2	# y2 = k + w + S1 + CH
     329        mov     a, y0			# y0 = a
     330        add     y2, h                   # h = h + S1 + CH + k + w
     331        mov     a, y2                   # y2 = a
     332        or      c, y0                   # y0 = a|c
     333        add     h, d                    # d = d + h + S1 + CH + k + w
     334        and     c, y2                   # y2 = a&c
     335        and     b, y0                   # y0 = (a|c)&b
     336        add     y1, h                   # h = h + S1 + CH + k + w + S0
     337        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
     338        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
     339        ROTATE_ARGS
     340.endm
    341
    342########################################################################
    343## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks)
    344## arg 1 : pointer to state
    345## arg 2 : pointer to input data
    346## arg 3 : Num blocks
    347########################################################################
    348.text
     349SYM_FUNC_START(sha256_transform_avx)
     350.align 32
	# Save all SysV callee-saved GPRs used as round variables (b, TBL,
	# y0..y2) plus rbp, which anchors the unaligned pre-allocation rsp.
     351	pushq   %rbx
     352	pushq   %r12
     353	pushq   %r13
     354	pushq   %r14
     355	pushq   %r15
     356	pushq	%rbp
     357	movq	%rsp, %rbp		# rbp = rsp before alloc/align, for the epilogue
     358
     359	subq    $STACK_SIZE, %rsp	# allocate stack space
     360	and	$~15, %rsp		# align stack pointer (vmovdqa to _XFER needs 16B)
     361
     362	shl     $6, NUM_BLKS		# convert to bytes
     363	jz      done_hash		# zero blocks: nothing to do
     364	add     INP, NUM_BLKS		# pointer to end of data
     365	mov     NUM_BLKS, _INP_END(%rsp)	# NUM_BLKS (%rdx) is dead after this; e reuses %edx
     366
     367	## load initial digest
     368	mov     4*0(CTX), a
     369	mov     4*1(CTX), b
     370	mov     4*2(CTX), c
     371	mov     4*3(CTX), d
     372	mov     4*4(CTX), e
     373	mov     4*5(CTX), f
     374	mov     4*6(CTX), g
     375	mov     4*7(CTX), h
     376
     377	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
     378	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
     379	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
loop0:	# one iteration per 64-byte input block
     380loop0:
     381	lea     K256(%rip), TBL
     382
     383	## byte swap first 16 dwords
     384	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
     385	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
     386	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
     387	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
     388
     389	mov     INP, _INP(%rsp)		# save INP: SRND below reuses %rsi
     390
     391	## schedule 48 input dwords, by doing 3 rounds of 16 each
     392	mov     $3, SRND
     393.align 16
     394loop1:
	# rounds 0..47: each FOUR_ROUNDS_AND_SCHED consumes one 16B slice
	# of k+w from _XFER and produces the next 4 schedule words
     395	vpaddd  (TBL), X0, XFER
     396	vmovdqa XFER, _XFER(%rsp)
     397	FOUR_ROUNDS_AND_SCHED
     398
     399	vpaddd  1*16(TBL), X0, XFER
     400	vmovdqa XFER, _XFER(%rsp)
     401	FOUR_ROUNDS_AND_SCHED
     402
     403	vpaddd  2*16(TBL), X0, XFER
     404	vmovdqa XFER, _XFER(%rsp)
     405	FOUR_ROUNDS_AND_SCHED
     406
     407	vpaddd  3*16(TBL), X0, XFER
     408	vmovdqa XFER, _XFER(%rsp)
     409	add	$4*16, TBL		# advance to the next 16 round constants
     410	FOUR_ROUNDS_AND_SCHED
     411
     412	sub     $1, SRND
     413	jne     loop1
     414
     415	mov     $2, SRND
loop2:	# rounds 48..63: no more scheduling needed, plain rounds only
     416loop2:
     417	vpaddd  (TBL), X0, XFER
     418	vmovdqa XFER, _XFER(%rsp)
     419	DO_ROUND        0
     420	DO_ROUND        1
     421	DO_ROUND        2
     422	DO_ROUND        3
     423
     424	vpaddd  1*16(TBL), X1, XFER
     425	vmovdqa XFER, _XFER(%rsp)
     426	add     $2*16, TBL
     427	DO_ROUND        0
     428	DO_ROUND        1
     429	DO_ROUND        2
     430	DO_ROUND        3
     431
     432	vmovdqa X2, X0			# shift remaining schedule words down
     433	vmovdqa X3, X1
     434
     435	sub     $1, SRND
     436	jne     loop2
     437
	## add this block's working variables into the digest state
     438	addm    (4*0)(CTX),a
     439	addm    (4*1)(CTX),b
     440	addm    (4*2)(CTX),c
     441	addm    (4*3)(CTX),d
     442	addm    (4*4)(CTX),e
     443	addm    (4*5)(CTX),f
     444	addm    (4*6)(CTX),g
     445	addm    (4*7)(CTX),h
     446
     447	mov     _INP(%rsp), INP		# restore INP (was clobbered by SRND)
     448	add     $64, INP
     449	cmp     _INP_END(%rsp), INP
     450	jne     loop0			# more blocks remain
     451
     452done_hash:
     453
	# NOTE(review): no vzeroupper before RET — presumably fine because
	# kernel callers wrap this in kernel_fpu_begin()/end(); confirm.
     454	mov	%rbp, %rsp		# undo both the sub and the align
     455	popq	%rbp
     456	popq    %r15
     457	popq    %r14
     458	popq    %r13
     459	popq	%r12
     460	popq    %rbx
     461	RET
     462SYM_FUNC_END(sha256_transform_avx)
    463
# SHA-256 round constants K[0..63] (FIPS 180-4 sect. 4.2.2: first 32 bits
# of the fractional parts of the cube roots of the first 64 primes)
     464.section	.rodata.cst256.K256, "aM", @progbits, 256
     465.align 64
     466K256:
     467	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
     468	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
     469	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
     470	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
     471	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
     472	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
     473	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
     474	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
     475	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
     476	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
     477	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
     478	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
     479	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
     480	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
     481	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
     482	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
     483
# vpshufb mask: reverse the byte order within each dword (BE -> LE)
     484.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
     485.align 16
     486PSHUFFLE_BYTE_FLIP_MASK:
     487	.octa 0x0c0d0e0f08090a0b0405060700010203

     488
     489.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
     490.align 16
     491# shuffle xBxA -> 00BA (0xFF bytes zero the discarded lanes)
     492_SHUF_00BA:
     493	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
     494
     495.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
     496.align 16
     497# shuffle xDxC -> DC00 (0xFF bytes zero the discarded lanes)
     498_SHUF_DC00:
     499	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF