/* blowfish-x86_64-asm_64.S (5954B) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * structure of crypto context
 *
 * Byte offsets from the context pointer.  The round-key array p starts at
 * offset 0 and spans (16 + 2) 32-bit words; it is followed by the four
 * lookup tables s0..s3 of 256 32-bit words each.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * register macros
 *
 * Several of these alias the incoming argument registers, which dictates
 * the order in which the entry points below shuffle their arguments:
 *  - RT0 is %rdi (the ctx argument), so ctx is copied to CTX first.
 *  - RT1 is %rsi, the very same register as RIO, so F() and F4() destroy
 *    RIO.  The dst pointer is therefore parked in %r10 (1-way) or %r11
 *    (4-way, where %r10 is RKEY) and RIO is reloaded after the rounds.
 *  - RX2 and RX3 are %rcx and %rdx (the xor flag and src arguments).
 *  - CTX (%r12) and RX1 (%rbx) are saved and restored by the functions
 *    that use them.
 */
#define CTX %r12
#define RIO %rsi

#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one cipher step on the block held in RX0.
 *
 * With a, b, c, d the four bytes of the low 32 bits of RX0 (a being the
 * most significant one):
 *	t = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]		(32-bit arithmetic)
 * The two 32-bit halves of RX0 are then exchanged and t is XORed into the
 * new low half.  The rorq $16 / rolq $16 pair only serves to expose a and
 * b through RX0bh/RX0bl; the rolq $32 performs the half swap.
 * RT0d is only ever written with 32-bit operations, so the upper half of
 * RT0 is zero and the final xorq leaves the upper half of RX0 untouched.
 *
 * Clobbers RT0, RT1 (and therefore RIO) and RT2.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * XOR the 8 bytes at p + 4*n, i.e. round-key words n and n+1, into RX0:
 * word n lands in the low half, word n+1 in the high half.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Apply round-key words n and n+1, then run two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Decryption counterpart of add_roundkey_enc(): loads round-key words n-1
 * and n, exchanges them with rorq $32 so that word n lands in the low half
 * and word n-1 in the high half, and XORs the pair into RX0.
 * The argument is parenthesised, as in preload_roundkey_dec(), so that an
 * expression argument expands correctly.  Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* Apply round-key words n and n-1, then run two F() steps. */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load one 8-byte block from (RIO) into RX0.  After the rorq/bswapq pair
 * the low half of RX0 holds the first four bytes of the block read as a
 * big-endian 32-bit value and the high half holds the last four bytes
 * read the same way.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * Store RX0 to the 8 bytes at (RIO): the high half becomes the first four
 * bytes (big-endian), the low half the last four.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Same byte order as write_block(), but XORs into the 8 bytes at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);

/*
 * Encrypt one 8-byte block.
 *
 * Reads 8 bytes from src and either stores the result to dst or, when the
 * low byte of %rcx is non-zero, XORs the result into the 8 bytes at dst.
 *
 * Writes %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.  %r12 is used as
 * CTX and preserved by keeping its old value in %r11; the stack is not
 * touched.
 */
SYM_FUNC_START(__blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	movq %r12, %r11;	/* keep the caller's %r12 while it serves as CTX */

	movq %rdi, CTX;		/* %rdi is RT0, scratch inside F() */
	movq %rsi, %r10;	/* park dst: %rsi is RIO and RT1 */
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r11, %r12;	/* CTX is no longer needed */

	movq %r10, RIO;		/* RIO = dst */
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	RET;
.L__enc_xor:
	xor_block();
	RET;
SYM_FUNC_END(__blowfish_enc_blk)

/*
 * Decrypt one 8-byte block: reads 8 bytes from src, stores 8 bytes to dst.
 *
 * Writes %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.  %r12 is used as
 * CTX and preserved by keeping its old value in %r11; the stack is not
 * touched.
 */
SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %r12, %r11;	/* keep the caller's %r12 while it serves as CTX */

	movq %rdi, CTX;		/* %rdi is RT0, scratch inside F() */
	movq %rsi, %r10;	/* park dst: %rsi is RIO and RT1 */
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;		/* RIO = dst */
	write_block();

	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * Computes the same value as F() for the block register x (one of
 * RX0..RX3): all four bytes of the low half are extracted around two
 * rorq $16, which together also exchange the two halves of x, and the
 * result is XORed into the new low half.  "x ## bh" / "x ## bl" paste the
 * macro name (for example RX0bh), which then expands to the matching byte
 * register.
 *
 * Clobbers RT0, RT1 (and therefore RIO), RT2 and RT3.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* XOR the round-key pair currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = round-key words n (low half) and n+1 (high half). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * Apply the key pair already in RKEY, then fetch the pair the next round
 * will need.  The key for round n itself must have been preloaded.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* Key addition followed by two F4() steps on each of the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = round-key words n (low half) and n-1 (high half). */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * Apply the key pair already in RKEY, then fetch the pair the next round
 * will need.  The key for round n itself must have been preloaded.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* Key addition followed by two F4() steps on each of the four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, each in
 * the same half/byte order as read_block().  RX3 is %rdx, so the original
 * src argument register is overwritten here.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Store RX0..RX3 to the 32 bytes at (RIO), byte order as write_block(). */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/* XOR RX0..RX3 into the 32 bytes at (RIO), byte order as xor_block(). */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);

/*
 * Encrypt four consecutive 8-byte blocks.
 *
 * Reads 32 bytes from src and either stores the result to dst or, when
 * the low byte of the %rcx argument is non-zero, XORs the result into the
 * 32 bytes at dst.
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored.  The xor
 * flag is saved there too, because %rcx doubles as RX2.
 * Writes %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and the flags.
 */
SYM_FUNC_START(__blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	pushq %r12;
	pushq %rbx;
	pushq %rcx;		/* xor flag; %rcx is about to become RX2 */

	movq %rdi, CTX;		/* %rdi is RT0, scratch inside F4() */
	movq %rsi, %r11;	/* park dst: %rsi is RIO and RT1 */
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();	/* pair 16/17, preloaded by round_enc4(14) */

	popq %r12;		/* saved xor flag; CTX is no longer needed */
	movq %r11, RIO;		/* RIO = dst */

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	RET;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)

/*
 * Decrypt four consecutive 8-byte blocks: reads 32 bytes from src, stores
 * 32 bytes to dst.
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored.
 * Writes %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and the flags.
 */
SYM_FUNC_START(blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;		/* %rdi is RT0, scratch inside F4() */
	movq %rsi, %r11;	/* park dst: %rsi is RIO and RT1 */
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();	/* pair 1/0, preloaded by round_dec4(3) */

	movq %r11, RIO;		/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk_4way)