aes-spe-core.S (7630B)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Fast AES implementation for SPE instruction set (PPC)
4 *
5 * This code makes use of the SPE SIMD instruction set as defined in
6 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
7 * Implementation is based on optimization guide notes from
8 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9 *
10 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11 */
12
13#include <asm/ppc_asm.h>
14#include "aes-spe-regs.h"
15
/*
 * Lookup helper macros. Every macro body ends its instruction with ';' so
 * that the composed macros (LAH, LAL, LAE, LAD) can expand two instructions
 * onto one assembler line.
 * NOTE(review): rT0, rT1, rKP, rD0-rD3 and rW0-rW7 are presumably register
 * aliases provided by aes-spe-regs.h - that header is not visible here.
 */

/*
 * EAD(in, bpos): compute a table address in rT0.
 * rlwimi rotates `in` left by 28-((bpos+3)%4)*8 bits and inserts the result
 * into bits 20-27 of rT0 (mask 0x00000ff0); all other bits of rT0 are kept.
 * The rotate amounts work out to 4, 28, 20, 12 for bpos 0..3, so bpos
 * selects a byte of `in` counted from the least significant byte (bpos 0)
 * to the most significant byte (bpos 3), and that byte ends up multiplied
 * by 16, i.e. it indexes 16 byte wide table entries.
 * NOTE(review): this presumably requires the table base held in rT0 to be
 * aligned so that bits 20-27 are free for the index - confirm against the
 * table definition and the code that sets up rT0.
 */
16#define EAD(in, bpos) \
17 rlwimi rT0,in,28-((bpos+3)%4)*8,20,27;
18
/*
 * DAD(in, bpos): compute a table address in rT1.
 * Same byte selection as EAD (bpos 0 = least significant byte of `in`), but
 * the byte is inserted unscaled into bits 24-31 of rT1 (mask 0x000000ff),
 * so it indexes single byte table entries.
 * NOTE(review): presumably needs a 256 byte aligned table base in rT1 -
 * confirm against the callers.
 */
19#define DAD(in, bpos) \
20 rlwimi rT1,in,24-((bpos+3)%4)*8,24,31;
21
/*
 * LWH(out, off): load the word at off(rT0) with evlwwsplat, which places
 * the same 32 bit word into both halves of the 64 bit register `out`. The
 * code pairs it with a later LWL on the same register to end up with two
 * different words in the upper and lower half.
 */
22#define LWH(out, off) \
23 evlwwsplat out,off(rT0); /* load word high */
24
/*
 * LWL(out, off): load the word at off(rT0) with a plain lwz.
 * NOTE(review): the LWH/LWL pairing relies on lwz leaving the upper half of
 * the 64 bit register untouched - confirm for the targeted cores.
 */
25#define LWL(out, off) \
26 lwz out,off(rT0); /* load word low */
27
/* LBZ(out, tab, off): load the single byte at off(tab) into `out`. */
28#define LBZ(out, tab, off) \
29 lbz out,off(tab); /* load byte */
30
/*
 * LAH(out, in, bpos, off): EAD followed by LWH, i.e. index the table with
 * byte `bpos` of `in` and splat the word at offset `off` of that entry into
 * both halves of `out`.
 */
31#define LAH(out, in, bpos, off) \
32 EAD(in, bpos) /* calc addr + load word high */ \
33 LWH(out, off)
34
/*
 * LAL(out, in, bpos, off): EAD followed by LWL, i.e. index the table with
 * byte `bpos` of `in` and load the word at offset `off` of that entry with
 * lwz.
 */
35#define LAL(out, in, bpos, off) \
36 EAD(in, bpos) /* calc addr + load word low */ \
37 LWL(out, off)
38
/*
 * LAE(out, in, bpos): EAD followed by a byte load from offset 8 of the
 * selected 16 byte table entry (the "enc byte").
 */
39#define LAE(out, in, bpos) \
40 EAD(in, bpos) /* calc addr + load enc byte */ \
41 LBZ(out, rT0, 8)
42
/*
 * LBE(out): byte load from offset 8 of the entry rT0 currently points to.
 * Unlike LAE it does not recompute the address; an EAD must have been
 * issued before.
 */
43#define LBE(out) \
44 LBZ(out, rT0, 8) /* load enc byte */
45
/*
 * LAD(out, in, bpos): DAD followed by a byte load from 0(rT1)
 * (the "dec byte").
 */
46#define LAD(out, in, bpos) \
47 DAD(in, bpos) /* calc addr + load dec byte */ \
48 LBZ(out, rT1, 0)
49
/*
 * LBD(out): byte load from 0(rT1) without recomputing the address; a DAD
 * must have been issued before.
 */
50#define LBD(out) \
51 LBZ(out, rT1, 0)
52
53/*
54 * ppc_encrypt_block: The central encryption function for a single 16 bytes
55 * block. It does no stack handling or register saving to support fast calls
56 * via bl/blr. It expects that caller has pre-xored input data with first
57 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
58 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
59 * and rW0-rW3 and caller must execute a final xor on the output registers.
60 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 61 *
 * Structure of the code below:
 *  - Three table words (rW4, rW6, rW3) are loaded before the loop is
 *    entered; every round re-loads the same three registers at its end
 *    from the freshly computed state, so they are ready for the next round.
 *  - One loop iteration performs two rounds (round key words at 16/24(rKP)
 *    and 32/40(rKP)) and then advances rKP by 32 bytes; bdnz therefore
 *    counts pairs of rounds in CTR.
 *  - After the loop one more word-table round runs (key at 16/24(rKP)),
 *    followed by a last round that is built from single byte loads
 *    (LAE/LBE) packed into rW0-rW3 with rlwimi. The key words for the
 *    caller's final xor are loaded from 32..44(rKP) into rD0-rD3.
 62 */
63_GLOBAL(ppc_encrypt_block)
/* Lookups that every later round issues at the end of the previous round. */
64 LAH(rW4, rD1, 2, 4)
65 LAH(rW6, rD0, 3, 0)
66 LAH(rW3, rD0, 1, 8)
67ppc_encrypt_block_loop:
/* Round A: table lookups indexed by the bytes of the state in rD0-rD3. */
68 LAH(rW0, rD3, 0, 12)
69 LAL(rW0, rD0, 0, 12)
70 LAH(rW1, rD1, 0, 12)
71 LAH(rW2, rD2, 1, 8)
72 LAL(rW2, rD3, 1, 8)
73 LAL(rW3, rD1, 1, 8)
74 LAL(rW4, rD2, 2, 4)
75 LAL(rW6, rD1, 3, 0)
76 LAH(rW5, rD3, 2, 4)
77 LAL(rW5, rD0, 2, 4)
78 LAH(rW7, rD2, 3, 0)
/*
 * rD1 = key words at 16(rKP) ^ rW0 ^ rW2 ^ rW4 ^ rW6. The two outstanding
 * low-word lookups (rW7, rW1) are interleaved with the xors; they still
 * index with the old rD3/rD2 values.
 */
79 evldw rD1,16(rKP)
80 EAD(rD3, 3)
81 evxor rW2,rW2,rW4
82 LWL(rW7, 0)
83 evxor rW2,rW2,rW6
84 EAD(rD2, 0)
85 evxor rD1,rD1,rW2
86 LWL(rW1, 12)
87 evxor rD1,rD1,rW0
/*
 * rD3 = key words at 24(rKP) ^ rW1 ^ rW3 ^ rW5 ^ rW7. evmergehi copies the
 * upper word of rD1 (rD3) into the lower word of rD0 (rD2). The EAD/LWH
 * pairs in between already fetch rW4, rW6 and rW3 for the next round from
 * the new state.
 */
88 evldw rD3,24(rKP)
89 evmergehi rD0,rD0,rD1
90 EAD(rD1, 2)
91 evxor rW3,rW3,rW5
92 LWH(rW4, 4)
93 evxor rW3,rW3,rW7
94 EAD(rD0, 3)
95 evxor rD3,rD3,rW3
96 LWH(rW6, 0)
97 evxor rD3,rD3,rW1
98 EAD(rD0, 1)
99 evmergehi rD2,rD2,rD3
100 LWH(rW3, 8)
/* Round B: same sequence as round A, key words at 32(rKP) and 40(rKP). */
101 LAH(rW0, rD3, 0, 12)
102 LAL(rW0, rD0, 0, 12)
103 LAH(rW1, rD1, 0, 12)
104 LAH(rW2, rD2, 1, 8)
105 LAL(rW2, rD3, 1, 8)
106 LAL(rW3, rD1, 1, 8)
107 LAL(rW4, rD2, 2, 4)
108 LAL(rW6, rD1, 3, 0)
109 LAH(rW5, rD3, 2, 4)
110 LAL(rW5, rD0, 2, 4)
111 LAH(rW7, rD2, 3, 0)
112 evldw rD1,32(rKP)
113 EAD(rD3, 3)
114 evxor rW2,rW2,rW4
115 LWL(rW7, 0)
116 evxor rW2,rW2,rW6
117 EAD(rD2, 0)
118 evxor rD1,rD1,rW2
119 LWL(rW1, 12)
120 evxor rD1,rD1,rW0
121 evldw rD3,40(rKP)
122 evmergehi rD0,rD0,rD1
123 EAD(rD1, 2)
124 evxor rW3,rW3,rW5
125 LWH(rW4, 4)
126 evxor rW3,rW3,rW7
127 EAD(rD0, 3)
128 evxor rD3,rD3,rW3
129 LWH(rW6, 0)
130 evxor rD3,rD3,rW1
131 EAD(rD0, 1)
132 evmergehi rD2,rD2,rD3
133 LWH(rW3, 8)
/* Two rounds consumed 32 bytes of key material; loop while CTR is not 0. */
134 addi rKP,rKP,32
135 bdnz ppc_encrypt_block_loop
/*
 * Last word-table round. Same lookups as in the loop (the rW5/rW6 loads
 * are merely issued in a different order), key words at 16/24(rKP).
 */
136 LAH(rW0, rD3, 0, 12)
137 LAL(rW0, rD0, 0, 12)
138 LAH(rW1, rD1, 0, 12)
139 LAH(rW2, rD2, 1, 8)
140 LAL(rW2, rD3, 1, 8)
141 LAL(rW3, rD1, 1, 8)
142 LAL(rW4, rD2, 2, 4)
143 LAH(rW5, rD3, 2, 4)
144 LAL(rW6, rD1, 3, 0)
145 LAL(rW5, rD0, 2, 4)
146 LAH(rW7, rD2, 3, 0)
147 evldw rD1,16(rKP)
148 EAD(rD3, 3)
149 evxor rW2,rW2,rW4
150 LWL(rW7, 0)
151 evxor rW2,rW2,rW6
152 EAD(rD2, 0)
153 evxor rD1,rD1,rW2
154 LWL(rW1, 12)
155 evxor rD1,rD1,rW0
/*
 * Instead of prefetching table words for another round, the tail of this
 * round already starts the byte loads (LBE) of the final round.
 */
156 evldw rD3,24(rKP)
157 evmergehi rD0,rD0,rD1
158 EAD(rD1, 0)
159 evxor rW3,rW3,rW5
160 LBE(rW2)
161 evxor rW3,rW3,rW7
162 EAD(rD0, 1)
163 evxor rD3,rD3,rW3
164 LBE(rW6)
165 evxor rD3,rD3,rW1
166 EAD(rD0, 0)
167 evmergehi rD2,rD2,rD3
/*
 * NOTE(review): rW1 is loaded here from byte 0 of rD0 and loaded again by
 * LAE(rW1, rD0, 0) two lines further down while rD0 is unchanged; the load
 * below looks redundant - confirm before touching the schedule.
 */
168 LBE(rW1)
/*
 * Final round: one byte per lookup (offset 8 of the table entry). rlwimi
 * packs the bytes into rW0-rW3: rotate by 8 into bits 16-23, by 16 into
 * bits 8-15 and by 24 into bits 0-7, the first byte staying in bits 24-31.
 */
169 LAE(rW0, rD3, 0)
170 LAE(rW1, rD0, 0)
171 LAE(rW4, rD2, 1)
172 LAE(rW5, rD3, 1)
173 LAE(rW3, rD2, 0)
174 LAE(rW7, rD1, 1)
175 rlwimi rW0,rW4,8,16,23
176 rlwimi rW1,rW5,8,16,23
177 LAE(rW4, rD1, 2)
178 LAE(rW5, rD2, 2)
179 rlwimi rW2,rW6,8,16,23
180 rlwimi rW3,rW7,8,16,23
181 LAE(rW6, rD3, 2)
182 LAE(rW7, rD0, 2)
183 rlwimi rW0,rW4,16,8,15
184 rlwimi rW1,rW5,16,8,15
185 LAE(rW4, rD0, 3)
186 LAE(rW5, rD1, 3)
187 rlwimi rW2,rW6,16,8,15
/*
 * From here on rD0-rD3 are overwritten, each one right after its last use
 * as a lookup index, with the four key words at 32..44(rKP) that the caller
 * xors with rW0-rW3.
 */
188 lwz rD0,32(rKP)
189 rlwimi rW3,rW7,16,8,15
190 lwz rD1,36(rKP)
191 LAE(rW6, rD2, 3)
192 LAE(rW7, rD3, 3)
193 rlwimi rW0,rW4,24,0,7
194 lwz rD2,40(rKP)
195 rlwimi rW1,rW5,24,0,7
196 lwz rD3,44(rKP)
197 rlwimi rW2,rW6,24,0,7
198 rlwimi rW3,rW7,24,0,7
199 blr
200
201/*
202 * ppc_decrypt_block: The central decryption function for a single 16 bytes
203 * block. It does no stack handling or register saving to support fast calls
204 * via bl/blr. It expects that caller has pre-xored input data with first
205 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
206 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
207 * and rW0-rW3 and caller must execute a final xor on the output registers.
208 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 209 *
 * Structure of the code below:
 *  - Three table words (rW0, rW6, rW3) are loaded before the loop is
 *    entered; every round re-loads the same three registers at its end
 *    from the freshly computed state, so they are ready for the next round.
 *  - One loop iteration performs two rounds (round key words at 16/24(rKP)
 *    and 32/40(rKP)) and then advances rKP by 32 bytes; bdnz therefore
 *    counts pairs of rounds in CTR.
 *  - After the loop one more word-table round runs (key at 16/24(rKP)),
 *    followed by a last round that is built from single byte loads
 *    (LAD/LBD) packed into rW0-rW3 with rlwimi. The key words for the
 *    caller's final xor are loaded from 32..44(rKP) into rD0-rD3.
 *  - The last round addresses its byte table through rT1 (DAD/LAD/LBD), so
 *    rT1 must have been set up by the caller as well, in addition to the
 *    registers listed above.
 210 */
211_GLOBAL(ppc_decrypt_block)
/* Lookups that every later round issues at the end of the previous round. */
212 LAH(rW0, rD1, 0, 12)
213 LAH(rW6, rD0, 3, 0)
214 LAH(rW3, rD0, 1, 8)
215ppc_decrypt_block_loop:
/* Round A: table lookups indexed by the bytes of the state in rD0-rD3. */
216 LAH(rW1, rD3, 0, 12)
217 LAL(rW0, rD2, 0, 12)
218 LAH(rW2, rD2, 1, 8)
219 LAL(rW2, rD3, 1, 8)
220 LAH(rW4, rD3, 2, 4)
221 LAL(rW4, rD0, 2, 4)
222 LAL(rW6, rD1, 3, 0)
223 LAH(rW5, rD1, 2, 4)
224 LAH(rW7, rD2, 3, 0)
225 LAL(rW7, rD3, 3, 0)
226 LAL(rW3, rD1, 1, 8)
/*
 * rD1 = key words at 16(rKP) ^ rW0 ^ rW2 ^ rW4 ^ rW6. The two outstanding
 * low-word lookups (rW1, rW5) are interleaved with the xors; they still
 * index with the old rD0/rD2 values.
 */
227 evldw rD1,16(rKP)
228 EAD(rD0, 0)
229 evxor rW4,rW4,rW6
230 LWL(rW1, 12)
231 evxor rW0,rW0,rW4
232 EAD(rD2, 2)
233 evxor rW0,rW0,rW2
234 LWL(rW5, 4)
235 evxor rD1,rD1,rW0
/*
 * rD3 = key words at 24(rKP) ^ rW1 ^ rW3 ^ rW5 ^ rW7. evmergehi copies the
 * upper word of rD1 (rD3) into the lower word of rD0 (rD2). The EAD/LWH
 * pairs in between already fetch rW0, rW6 and rW3 for the next round from
 * the new state.
 */
236 evldw rD3,24(rKP)
237 evmergehi rD0,rD0,rD1
238 EAD(rD1, 0)
239 evxor rW3,rW3,rW7
240 LWH(rW0, 12)
241 evxor rW3,rW3,rW1
242 EAD(rD0, 3)
243 evxor rD3,rD3,rW3
244 LWH(rW6, 0)
245 evxor rD3,rD3,rW5
246 EAD(rD0, 1)
247 evmergehi rD2,rD2,rD3
248 LWH(rW3, 8)
/* Round B: same sequence as round A, key words at 32(rKP) and 40(rKP). */
249 LAH(rW1, rD3, 0, 12)
250 LAL(rW0, rD2, 0, 12)
251 LAH(rW2, rD2, 1, 8)
252 LAL(rW2, rD3, 1, 8)
253 LAH(rW4, rD3, 2, 4)
254 LAL(rW4, rD0, 2, 4)
255 LAL(rW6, rD1, 3, 0)
256 LAH(rW5, rD1, 2, 4)
257 LAH(rW7, rD2, 3, 0)
258 LAL(rW7, rD3, 3, 0)
259 LAL(rW3, rD1, 1, 8)
260 evldw rD1,32(rKP)
261 EAD(rD0, 0)
262 evxor rW4,rW4,rW6
263 LWL(rW1, 12)
264 evxor rW0,rW0,rW4
265 EAD(rD2, 2)
266 evxor rW0,rW0,rW2
267 LWL(rW5, 4)
268 evxor rD1,rD1,rW0
269 evldw rD3,40(rKP)
270 evmergehi rD0,rD0,rD1
271 EAD(rD1, 0)
272 evxor rW3,rW3,rW7
273 LWH(rW0, 12)
274 evxor rW3,rW3,rW1
275 EAD(rD0, 3)
276 evxor rD3,rD3,rW3
277 LWH(rW6, 0)
278 evxor rD3,rD3,rW5
279 EAD(rD0, 1)
280 evmergehi rD2,rD2,rD3
281 LWH(rW3, 8)
/* Two rounds consumed 32 bytes of key material; loop while CTR is not 0. */
282 addi rKP,rKP,32
283 bdnz ppc_decrypt_block_loop
/* Last word-table round, key words at 16/24(rKP). */
284 LAH(rW1, rD3, 0, 12)
285 LAL(rW0, rD2, 0, 12)
286 LAH(rW2, rD2, 1, 8)
287 LAL(rW2, rD3, 1, 8)
288 LAH(rW4, rD3, 2, 4)
289 LAL(rW4, rD0, 2, 4)
290 LAL(rW6, rD1, 3, 0)
291 LAH(rW5, rD1, 2, 4)
292 LAH(rW7, rD2, 3, 0)
293 LAL(rW7, rD3, 3, 0)
294 LAL(rW3, rD1, 1, 8)
295 evldw rD1,16(rKP)
296 EAD(rD0, 0)
297 evxor rW4,rW4,rW6
298 LWL(rW1, 12)
299 evxor rW0,rW0,rW4
300 EAD(rD2, 2)
301 evxor rW0,rW0,rW2
302 LWL(rW5, 4)
303 evxor rD1,rD1,rW0
/*
 * Instead of prefetching table words for another round, the tail of this
 * round already starts the byte loads of the final round. These go through
 * rT1 (DAD/LBD) rather than rT0.
 */
304 evldw rD3,24(rKP)
305 evmergehi rD0,rD0,rD1
306 DAD(rD1, 0)
307 evxor rW3,rW3,rW7
308 LBD(rW0)
309 evxor rW3,rW3,rW1
310 DAD(rD0, 1)
311 evxor rD3,rD3,rW3
312 LBD(rW6)
313 evxor rD3,rD3,rW5
314 DAD(rD0, 0)
315 evmergehi rD2,rD2,rD3
316 LBD(rW3)
/*
 * Final round: one byte per lookup from the table behind rT1. rlwimi packs
 * the bytes into rW0-rW3: rotate by 8 into bits 16-23, by 16 into bits 8-15
 * and by 24 into bits 0-7, the first byte staying in bits 24-31.
 */
317 LAD(rW2, rD3, 0)
318 LAD(rW1, rD2, 0)
319 LAD(rW4, rD2, 1)
320 LAD(rW5, rD3, 1)
321 LAD(rW7, rD1, 1)
322 rlwimi rW0,rW4,8,16,23
323 rlwimi rW1,rW5,8,16,23
324 LAD(rW4, rD3, 2)
325 LAD(rW5, rD0, 2)
326 rlwimi rW2,rW6,8,16,23
327 rlwimi rW3,rW7,8,16,23
328 LAD(rW6, rD1, 2)
329 LAD(rW7, rD2, 2)
330 rlwimi rW0,rW4,16,8,15
331 rlwimi rW1,rW5,16,8,15
332 LAD(rW4, rD0, 3)
333 LAD(rW5, rD1, 3)
334 rlwimi rW2,rW6,16,8,15
/*
 * From here on rD0-rD3 are overwritten, each one right after its last use
 * as a lookup index, with the four key words at 32..44(rKP) that the caller
 * xors with rW0-rW3.
 */
335 lwz rD0,32(rKP)
336 rlwimi rW3,rW7,16,8,15
337 lwz rD1,36(rKP)
338 LAD(rW6, rD2, 3)
339 LAD(rW7, rD3, 3)
340 rlwimi rW0,rW4,24,0,7
341 lwz rD2,40(rKP)
342 rlwimi rW1,rW5,24,0,7
343 lwz rD3,44(rKP)
344 rlwimi rW2,rW6,24,0,7
345 rlwimi rW3,rW7,24,0,7
346 blr