cast6-avx-x86_64-asm_64.S (9044B)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * run through the C preprocessor.
 *
 * Constants that live in .rodata are addressed RIP-relative. Only the
 * indexed s-box lookups keep absolute addressing, because a RIP-relative
 * operand cannot be combined with an index register.
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * structure of crypto context
 *
 * km: byte offset of the masking keys; get_round_keys() reads one 32-bit
 *     value at km + 4*nn.
 * kr: byte offset of the rotation keys; preload_rkr() reads 16 bytes at
 *     kr + 16*n and get_round_keys() consumes them one byte at a time.
 */
#define km	0
#define kr	(12*4*4)

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

/* Context pointer; kept in %r15 because %rdi doubles as RID1 below. */
#define CTX %r15

/* First group of four blocks (one 32-bit word of each block per lane). */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

/* Second group of four blocks. */
#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

/* Scratch vector holding the round function input/output. */
#define RX %xmm8

/*
 * Round key state:
 *   RKM  - current masking key, broadcast to all lanes
 *   RKR  - queue of rotation key bytes, lowest byte is the next one
 *   RKRF - rotate-left count for the current round (RKR & R1ST)
 *   RKRR - matching right shift count (R32 - RKRF)
 *   R32  - constant 32 (loaded from .L32_mask)
 *   R1ST - constant 0x1f in the lowest byte (loaded from .Lfirst_mask)
 */
#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

/* S-box index registers (written with movzbl in lookup_32bit). */
#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

/*
 * General purpose copies of the vector lanes. The bl/bh aliases name the
 * two lowest bytes of each register and are built by token pasting in
 * lookup_32bit (src ## bl, src ## bh).
 */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/* Accumulators for the s-box combination; the d aliases are the low 32 bits. */
#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


/*
 * lookup_32bit - combine four s-box entries selected by the low 32 bits
 * of src.
 *
 * With b0..b3 being the bytes of the low dword of src (b0 least
 * significant), the 32-bit result left in dst is
 *
 *	((s1[b1] op1 s2[b0]) op2 s3[b3]) op3 s4[b2]
 *
 * src          - RGIn register name, passed unexpanded so that the
 *                bl/bh aliases can be pasted together
 * dst          - RFSn register name (dst ## d is written)
 * op1..op3     - 32-bit ALU mnemonics (xorl/subl/addl in this file)
 * interleave_op(il_reg) - extra statement slotted between the loads,
 *                either shr_next(reg) or dummy(none)
 *
 * src is shifted right by 16 as a side effect (by 32 in total when
 * interleave_op is shr_next on the same register).
 * Clobbers RID1, RID2 and the flags.
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);                   \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;

/* Placeholder for an optional macro argument: expands to nothing. */
#define dummy(d) /* do nothing */

/* Advance reg to its next 16 bits. */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * F_head - first half of the round function for four blocks.
 *
 * Computes x = RKM op0 a per 32-bit lane, rotates every lane left by
 * RKRF (left shift by RKRF or-ed with right shift by RKRR), then copies
 * the low and high 64 bits of x into the general purpose registers gi1
 * and gi2. Clobbers RTMP.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;                 \
	vpslld	RKRF,	x,    RTMP;              \
	vpsrld	RKRR,	x,    x;                 \
	vpor	RTMP,	x,    x;                 \
	\
	vmovq		x,    gi1;               \
	vpextrq $1,	x,    gi2;

/*
 * F_tail - second half of the round function for four blocks.
 *
 * Runs lookup_32bit on each of the four dwords held in gi1/gi2 and
 * reassembles the four 32-bit results into x (gi1 results in the low
 * 64 bits, gi2 results in the high 64 bits). The parameter a is unused.
 *
 * The "##" in front of gi1/gi2 is intentional: it keeps the argument
 * from being macro-expanded here, so lookup_32bit still receives the
 * RGIn name and can paste the bl/bh suffixes onto it.
 *
 * Clobbers gi1, gi2, RFS1, RFS2, RFS3, RID1, RID2 and the flags.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;

/*
 * F_2 - apply one round function to both groups of four blocks:
 *	a1 ^= f(b1), a2 ^= f(b2)
 *
 * Both F_head invocations go through RX; that is fine because each one
 * moves its result to general purpose registers (RGI1/RGI2, RGI3/RGI4)
 * before the next one overwrites RX. The two F_tail results end up in
 * RX and RTMP respectively.
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;

/* The three round function flavours differ only in the operators used. */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

/*
 * qop - out ^= F<f>(in) for both block groups; in/out are register name
 * stems (RA..RD) that get the 1/2 suffix pasted on.
 */
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

/*
 * get_round_keys - set up RKM/RKRF/RKRR for round key index nn.
 *
 * Broadcasts the 32-bit masking key at km + 4*nn, takes the rotation
 * count from the lowest byte of RKR (masked with R1ST), derives the
 * complementary count 32 - RKRF, and drops the consumed byte from RKR.
 */
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;

/* Forward quad-round n: consumes round keys 4n+0 .. 4n+3 in that order. */
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

/* Reverse quad-round n: same steps as Q(n) in the opposite order. */
#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

/*
 * shuffle - reorder the rotation key bytes in RKR.
 * mask must be the label of a 16-byte vpshufb mask in .rodata; it is
 * addressed RIP-relative.
 */
#define shuffle(mask) \
	vpshufb		mask(%rip),  RKR, RKR;

/*
 * preload_rkr - load the 16 rotation key bytes of key group n into RKR.
 *
 * do_mask(mask) is either shuffle(<mask label>) to put the bytes into
 * the order in which the following Q/QBAR rounds consume them, or
 * dummy(none) when the stored order is already right.
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask(%rip),  RKR;              \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),   RKR, RKR;         \
	do_mask(mask);

/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit words held in
 * x0..x3, in place. t0..t2 are clobbered.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/*
 * inpack_blocks - byte-shuffle x0..x3 with the mask in register rmask,
 * then transpose. t0..t2 are clobbered.
 */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0,	x0; \
	vpshufb rmask, x1,	x1; \
	vpshufb rmask, x2,	x2; \
	vpshufb rmask, x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * outunpack_blocks - inverse of inpack_blocks: transpose first, then
 * byte-shuffle x0..x3 with rmask. t0..t2 are clobbered.
 */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,		x0, x0;       \
	vpshufb rmask,		x1, x1;       \
	vpshufb rmask,		x2, x2;       \
	vpshufb rmask,		x3, x3;

/*
 * 16-byte vpshufb masks.
 *
 * .Lbswap_mask reverses the bytes inside each 32-bit word,
 * .Lbswap128_mask reverses all 16 bytes.
 *
 * The .Lrkr_* masks reorder the 16 rotation key bytes of one key group
 * so that the lowest byte is always the next one needed: a Q round
 * takes its four bytes in ascending order, a QBAR round in descending
 * order, and decryption walks the quad-rounds of a group from last to
 * first.
 */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* Byte value 16 in every position; xor-ed onto the rotation keys. */
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

/* The constant 32, used to derive the right shift count of a rotate. */
.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

/* Keeps the low five bits of the lowest byte: the current rotate count. */
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * clobbers:
	 *	%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r10, %xmm8-%xmm15, flags
	 *	(%rbx and %r15 are used too, but saved and restored here)
	 */

	pushq %r15;
	pushq %rbx;

	/* %rdi is reused as RID1 by the s-box lookups, so park ctx in CTX. */
	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* Key group 0 is consumed in stored order, so no shuffle is needed. */
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	/* RKM was overwritten by the round keys; reload the byte-swap mask. */
	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 * clobbers:
	 *	%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r10, %xmm8-%xmm15, flags
	 *	(%rbx and %r15 are used too, but saved and restored here)
	 */

	pushq %r15;
	pushq %rbx;

	/* %rdi is reused as RID1 by the s-box lookups, so park ctx in CTX. */
	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* Walk the key groups and quad-rounds in reverse of encryption. */
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	/* RKM was overwritten by the round keys; reload the byte-swap mask. */
	vmovdqa .Lbswap_mask(%rip), RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is clobbered (RID2) by the block function; keep dst in %r11. */
	movq %rsi, %r11;

	/* load_8way/store_8way come from glue_helper-asm-avx.S. */
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* %rsi is clobbered (RID2) by the block function; keep dst in %r11. */
	movq %rsi, %r11;

	/* load_8way/store_8way come from glue_helper-asm-avx.S. */
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	/*
	 * %rsi and %rdx are clobbered (RID2, RGI1) by the block function;
	 * keep dst in %r11 and src in %r12 for store_cbc_8way.
	 */
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	/*
	 * store_cbc_8way comes from glue_helper-asm-avx.S; presumably it
	 * applies the CBC chaining xor using the ciphertext still at src
	 * before storing to dst - confirm against that file.
	 */
	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)