xor.h
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

#define BLK64(pf, op, i)	\
	pf(i)			\
	op(i, 0)		\
	op(i + 1, 1)		\
	op(i + 2, 2)		\
	op(i + 3, 3)
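
/*
 * How the pieces above fit together (illustrative): each LD, XOn and ST
 * step moves one 16-byte SSE register's worth of data; LD(2, 2), for
 * example, expands to the text "movaps 16*(2)(%[p1]), %%xmm2".  A
 * BLOCK() covers four such lines (64 bytes), the loop bodies below
 * unroll four BLOCK()s per iteration (256 bytes), and that is why every
 * function computes lines = bytes >> 8.  The prefetches use prefetchnta,
 * whose non-temporal hint minimizes cache pollution by the streamed
 * source data ("cache avoiding"), with PF_OFFS() reaching 256 bytes
 * ahead of the line currently being processed.
 */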

static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
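
/*
 * The _pf64 variants below restructure the same work into phases: each
 * BLK64() issues one prefetchnta per 64-byte chunk and then applies a
 * single operation (load, XOR against one source, or store) to all four
 * lines of that chunk, with NOP() filling the prefetch slot of the
 * store phase.  The "prefetch64-sse" template at the bottom of this
 * file is built from these variants.
 */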

static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(NOP, ST, i)		\

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(PF2, XO2, i)		\
	BLK64(NOP, ST, i)		\

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(PF2, XO2, i)		\
	BLK64(PF3, XO3, i)		\
	BLK64(NOP, ST, i)		\

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4,
	  const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4,
	       const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(PF2, XO2, i)		\
	BLK64(PF3, XO3, i)		\
	BLK64(PF4, XO4, i)		\
	BLK64(NOP, ST, i)		\

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */
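
/*
 * Usage sketch (illustrative, not part of this header): each do_N hook
 * XORs N-1 source buffers into the first buffer, for example
 *
 *	xor_block_sse_pf64.do_2(PAGE_SIZE, dest, src);	// dest ^= src
 *
 * where the byte count is expected to be a multiple of 256 to match the
 * loop unrolling above, and dest/src are hypothetical buffer names.  At
 * boot, calibrate_xor_blocks() in crypto/xor.c benchmarks the candidate
 * templates, and the XOR_SELECT_TEMPLATE() hook above lets AVX_SELECT()
 * (defined in <asm/xor_avx.h>) override the measured winner with the
 * AVX implementation when the CPU supports it.
 */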