sha1-spe-asm.S (10069B)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

/*
 * Register roles. r5 carries the block count on entry; it is copied to CTR
 * before being reused as the constant table pointer rKP.
 */
#define rHP	r3	/* pointer to hash value			*/
#define rWP	r4	/* pointer to input				*/
#define rKP	r5	/* pointer to constants				*/

#define rW0	r14	/* 64 bit round words				*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rH0	r6	/* 32 bit hash values				*/
#define rH1	r7
#define rH2	r8
#define rH3	r9
#define rH4	r10

#define rT0	r22	/* 64 bit temporary				*/
#define rT1	r0	/* 32 bit temporaries				*/
#define rT2	r11
#define rT3	r12

#define rK	r23	/* 64 bit constant in non-volatile register	*/
			/* (saved and restored by INITIALIZE/FINALIZE)	*/

/*
 * LOAD_K<k>1 is selected by the "k" argument of the round macros:
 * k = 0 keeps the current constant, k = 1..4 loads entry k-1 of
 * PPC_SPE_SHA1_K into both words of rK.
 */
#define LOAD_K01

#define LOAD_K11 \
	evlwwsplat	rK,0(rKP);

#define LOAD_K21 \
	evlwwsplat	rK,4(rKP);

#define LOAD_K31 \
	evlwwsplat	rK,8(rKP);

#define LOAD_K41 \
	evlwwsplat	rK,12(rKP);

/*
 * Create a 128 byte stack frame and store r14-r23 as 64 bit values
 * in the 8 byte slots at offsets 8..80 of that frame.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1); \
	evstdw		r18,40(r1); \
	evstdw		r19,48(r1); \
	evstdw		r20,56(r1); \
	evstdw		r21,64(r1); \
	evstdw		r22,72(r1); \
	evstdw		r23,80(r1);

/*
 * Reload r14-r23 from the frame, overwrite the first word of each save
 * slot with zero and release the frame. Clobbers r0.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1); \
	evldw		r16,24(r1); \
	evldw		r17,32(r1); \
	evldw		r18,40(r1); \
	evldw		r19,48(r1); \
	evldw		r20,56(r1); \
	evldw		r21,64(r1); \
	evldw		r22,72(r1); \
	evldw		r23,80(r1); \
	xor		r0,r0,r0; \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* were already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1); \
	stw		r0,72(r1); \
	stw		r0,80(r1); \
	addi		r1,r1,128;	/* cleanup stack frame		*/

/*
 * LOAD_DATA fetches one 32 bit message word. The big endian variant reads
 * at a fixed offset and NEXT_BLOCK advances rWP by 64 once per block. The
 * other variant does a byte-reversed load, ignores "off" and advances rWP
 * by 4 after every word, so NEXT_BLOCK is empty there.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

/*
 * Round macros. Each invocation performs two SHA-1 rounds. Comments tagged
 * "1:" belong to the first round and "2:" to the second; the letters A..E
 * in the comments are the roles within that round. For the first round the
 * roles are the macro arguments (a, b, c, d, e); for the second round they
 * are shifted to (e, a, b, c, d). The instruction order is hand scheduled
 * and must not be changed.
 *
 * R_00_15: loads two message words. w0 receives the first word, w1 the
 * second; afterwards w1 is rewritten by evmergelo to hold both words.
 * A non-zero k loads the constant before it is used in this invocation.
 */
#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
	LOAD_DATA(w0, off)		/* 1: W				*/ \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	LOAD_K##k##1 \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	add		e,e,rT0;	/* 1: E = E + A'		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,w0;		/* 1: E = E + W			*/ \
	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	and		rT1,a,b;	/* 2: F' = B and C		*/ \
	add		e,e,rK;		/* 1: E = E + K			*/ \
	andc		rT2,c,a;	/* 2: F" = ~B and D		*/ \
	add		d,d,rK;		/* 2: E = E + K			*/ \
	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,w1;		/* 2: E = E + W			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	evmergelo	w1,w1,w0;	/*    mix W[0]/W[1]		*/ \
	add		d,d,rT2		/* 2: E = E + F			*/

/*
 * R_16_19: same F function as R_00_15, but the two message words are
 * computed in w0 from w0, w1, w4 and the w6/w7 pair instead of being
 * loaded. W + K for both rounds is formed by a single evaddw. A non-zero k
 * reloads rK after that evaddw, so the new constant is first used by the
 * following invocation.
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT1;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	andc		rT1,c,a;	/* 2: F" = ~B and D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT1		/* 2: E = E + F			*/

/*
 * R_20_39: message schedule as in R_16_19, F = B xor C xor D.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/

/*
 * R_40_59: message schedule as in R_16_19,
 * F = (B and C) or ((B or C) and D).
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	or		rT1,b,c;	/* 1: F" = B or C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	or		rT0,a,b;	/* 2: F" = B or C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/

/* R_60_79: identical to R_20_39; only the constant in rK differs. */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)

/*
 * ppc_spe_sha1_transform - SHA-1 compression over consecutive 64 byte blocks
 *
 * In:	r3 = pointer to the five 32 bit hash words, updated in place after
 *	     every block
 *	r4 = pointer to the input data, 64 bytes are consumed per block
 *	r5 = number of blocks to process
 *
 * Presumably called from C as
 *	void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
 * (prototype is not visible in this file - confirm against the caller).
 *
 * Clobbers r0, r4-r12, CTR and cr0. r14-r23 are used as well but are saved
 * and restored through a 128 byte stack frame.
 */
_GLOBAL(ppc_spe_sha1_transform)
	/*
	 * The loop below is bottom tested with bdnz. A count of zero would
	 * process one block, wrap CTR around and keep running, so return
	 * before anything is touched. No stack frame exists yet.
	 */
	cmpwi		r5,0
	beqlr

	INITIALIZE

	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	mtctr		r5			/* block count, frees r5 for rKP */
	lwz		rH2,8(rHP)
	lis		rKP,PPC_SPE_SHA1_K@h
	lwz		rH3,12(rHP)
	ori		rKP,rKP,PPC_SPE_SHA1_K@l
	lwz		rH4,16(rHP)

ppc_spe_sha1_main:
	/* rounds 0-15: k = 1 loads the first constant */
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

	/* rounds 16-19: k = 2 preloads the constant for rounds 20-39 */
	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

	/* rounds 20-39: k = 3 preloads the constant for rounds 40-59 */
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

	/* rounds 40-59: k = 4 preloads the constant for rounds 60-79 */
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

	/*
	 * rounds 60-79: the previous hash words are fetched into rT3 and
	 * rW1-rW4 between the last invocations, once those registers are no
	 * longer arguments of the remaining rounds.
	 */
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
	lwz		rT3,0(rHP)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
	lwz		rW1,4(rHP)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
	lwz		rW2,8(rHP)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
	lwz		rW3,12(rHP)
	NEXT_BLOCK
	lwz		rW4,16(rHP)

	/* add the previous hash value and write the new one back */
	add		rH0,rH0,rT3
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)

	bdnz		ppc_spe_sha1_main	/* next block while --CTR != 0 */

	FINALIZE
	blr

/*
 * Round constants, indexed by LOAD_K11..LOAD_K41.
 * NOTE(review): this file only reads the table; a read-only section may be
 * a better home - confirm against the linker layout before moving it.
 */
.data
.align 4
PPC_SPE_SHA1_K:
	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6