avx512.c (18244B)
1// SPDX-License-Identifier: GPL-2.0-or-later 2/* -*- linux-c -*- -------------------------------------------------------- 3 * 4 * Copyright (C) 2016 Intel Corporation 5 * 6 * Author: Gayatri Kammela <gayatri.kammela@intel.com> 7 * Author: Megha Dey <megha.dey@linux.intel.com> 8 * 9 * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved 10 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved 11 * 12 * ----------------------------------------------------------------------- 13 */ 14 15/* 16 * AVX512 implementation of RAID-6 syndrome functions 17 * 18 */ 19 20#ifdef CONFIG_AS_AVX512 21 22#include <linux/raid/pq.h> 23#include "x86.h" 24 25static const struct raid6_avx512_constants { 26 u64 x1d[8]; 27} raid6_avx512_constants __aligned(512/8) = { 28 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 29 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 30 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 31 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, 32}; 33 34static int raid6_have_avx512(void) 35{ 36 return boot_cpu_has(X86_FEATURE_AVX2) && 37 boot_cpu_has(X86_FEATURE_AVX) && 38 boot_cpu_has(X86_FEATURE_AVX512F) && 39 boot_cpu_has(X86_FEATURE_AVX512BW) && 40 boot_cpu_has(X86_FEATURE_AVX512VL) && 41 boot_cpu_has(X86_FEATURE_AVX512DQ); 42} 43 44static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) 45{ 46 u8 **dptr = (u8 **)ptrs; 47 u8 *p, *q; 48 int d, z, z0; 49 50 z0 = disks - 3; /* Highest data disk */ 51 p = dptr[z0+1]; /* XOR parity */ 52 q = dptr[z0+2]; /* RS syndrome */ 53 54 kernel_fpu_begin(); 55 56 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 57 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ 58 : 59 : "m" (raid6_avx512_constants.x1d[0])); 60 61 for (d = 0; d < bytes; d += 64) { 62 asm volatile("prefetchnta %0\n\t" 63 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ 64 "prefetchnta %1\n\t" 65 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ 66 "vmovdqa64 %1,%%zmm6" 67 : 68 : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); 69 for (z = z0-2; z >= 0; z--) { 70 asm volatile("prefetchnta %0\n\t" 71 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 72 "vpmovm2b %%k1,%%zmm5\n\t" 73 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 74 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 75 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 76 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" 77 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" 78 "vmovdqa64 %0,%%zmm6" 79 : 80 : "m" (dptr[z][d])); 81 } 82 asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 83 "vpmovm2b %%k1,%%zmm5\n\t" 84 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 85 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 86 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 87 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" 88 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" 89 "vmovntdq %%zmm2,%0\n\t" 90 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" 91 "vmovntdq %%zmm4,%1\n\t" 92 "vpxorq %%zmm4,%%zmm4,%%zmm4" 93 : 94 : "m" (p[d]), "m" (q[d])); 95 } 96 97 asm volatile("sfence" : : : "memory"); 98 kernel_fpu_end(); 99} 100 101static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, 102 size_t bytes, void **ptrs) 103{ 104 u8 **dptr = (u8 **)ptrs; 105 u8 *p, *q; 106 int d, z, z0; 107 108 z0 = stop; /* P/Q right side optimization */ 109 p = dptr[disks-2]; /* XOR parity */ 110 q = dptr[disks-1]; /* RS syndrome */ 111 112 kernel_fpu_begin(); 113 114 asm volatile("vmovdqa64 %0,%%zmm0" 115 : : "m" (raid6_avx512_constants.x1d[0])); 116 117 for (d = 0 ; d < bytes ; d += 64) { 118 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 119 "vmovdqa64 %1,%%zmm2\n\t" 120 "vpxorq %%zmm4,%%zmm2,%%zmm2" 121 : 122 : "m" (dptr[z0][d]), "m" (p[d])); 123 /* P/Q data pages */ 124 for (z = z0-1 ; z >= start ; z--) { 125 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 126 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 127 "vpmovm2b %%k1,%%zmm5\n\t" 128 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 129 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 130 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 131 "vmovdqa64 %0,%%zmm5\n\t" 132 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 133 "vpxorq %%zmm5,%%zmm4,%%zmm4" 134 : 135 : "m" (dptr[z][d])); 136 } 137 /* P/Q left side optimization */ 138 for (z = start-1 ; z >= 0 ; z--) { 139 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 140 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 141 "vpmovm2b %%k1,%%zmm5\n\t" 142 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 143 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 144 "vpxorq %%zmm5,%%zmm4,%%zmm4" 145 : 146 : ); 147 } 148 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" 149 /* Don't use movntdq for r/w memory area < cache line */ 150 "vmovdqa64 %%zmm4,%0\n\t" 151 "vmovdqa64 %%zmm2,%1" 152 : 153 : "m" (q[d]), "m" (p[d])); 154 } 155 156 asm volatile("sfence" : : : "memory"); 157 kernel_fpu_end(); 158} 159 160const struct raid6_calls raid6_avx512x1 = { 161 raid6_avx5121_gen_syndrome, 162 raid6_avx5121_xor_syndrome, 163 raid6_have_avx512, 164 "avx512x1", 165 .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ 166}; 167 168/* 169 * Unrolled-by-2 AVX512 implementation 170 */ 171static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) 172{ 173 u8 **dptr = (u8 **)ptrs; 174 u8 *p, *q; 175 int d, z, z0; 176 177 z0 = disks - 3; /* Highest data disk */ 178 p = dptr[z0+1]; /* XOR parity */ 179 q = dptr[z0+2]; /* RS syndrome */ 180 181 kernel_fpu_begin(); 182 183 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 184 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ 185 : 186 : "m" (raid6_avx512_constants.x1d[0])); 187 188 /* We uniformly assume a single prefetch covers at least 64 bytes */ 189 for (d = 0; d < bytes; d += 128) { 190 asm volatile("prefetchnta %0\n\t" 191 "prefetchnta %1\n\t" 192 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ 193 "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ 194 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ 195 "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ 196 : 197 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); 198 for (z = z0-1; z >= 0; z--) { 199 asm volatile("prefetchnta %0\n\t" 200 "prefetchnta %1\n\t" 201 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 202 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" 203 "vpmovm2b %%k1,%%zmm5\n\t" 204 "vpmovm2b %%k2,%%zmm7\n\t" 205 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 206 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 207 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 208 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 209 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 210 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 211 "vmovdqa64 %0,%%zmm5\n\t" 212 "vmovdqa64 %1,%%zmm7\n\t" 213 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 214 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 215 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 216 "vpxorq %%zmm7,%%zmm6,%%zmm6" 217 : 218 : "m" (dptr[z][d]), "m" (dptr[z][d+64])); 219 } 220 asm volatile("vmovntdq %%zmm2,%0\n\t" 221 "vmovntdq %%zmm3,%1\n\t" 222 "vmovntdq %%zmm4,%2\n\t" 223 "vmovntdq %%zmm6,%3" 224 : 225 : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), 226 "m" (q[d+64])); 227 } 228 229 asm volatile("sfence" : : : "memory"); 230 kernel_fpu_end(); 231} 232 233static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, 234 size_t bytes, void **ptrs) 235{ 236 u8 **dptr = (u8 **)ptrs; 237 u8 *p, *q; 238 int d, z, z0; 239 240 z0 = stop; /* P/Q right side optimization */ 241 p = dptr[disks-2]; /* XOR parity */ 242 q = dptr[disks-1]; /* RS syndrome */ 243 244 kernel_fpu_begin(); 245 246 asm volatile("vmovdqa64 %0,%%zmm0" 247 : : "m" (raid6_avx512_constants.x1d[0])); 248 249 for (d = 0 ; d < bytes ; d += 128) { 250 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 251 "vmovdqa64 %1,%%zmm6\n\t" 252 "vmovdqa64 %2,%%zmm2\n\t" 253 "vmovdqa64 %3,%%zmm3\n\t" 254 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" 255 "vpxorq %%zmm6,%%zmm3,%%zmm3" 256 : 257 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), 258 "m" (p[d]), "m" (p[d+64])); 259 /* P/Q data pages */ 260 for (z = z0-1 ; z >= start ; z--) { 261 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 262 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 263 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 264 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 265 "vpmovm2b %%k1,%%zmm5\n\t" 266 "vpmovm2b %%k2,%%zmm7\n\t" 267 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 268 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 269 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 270 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 271 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 272 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 273 "vmovdqa64 %0,%%zmm5\n\t" 274 "vmovdqa64 %1,%%zmm7\n\t" 275 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 276 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 277 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 278 "vpxorq %%zmm7,%%zmm6,%%zmm6" 279 : 280 : "m" (dptr[z][d]), "m" (dptr[z][d+64])); 281 } 282 /* P/Q left side optimization */ 283 for (z = start-1 ; z >= 0 ; z--) { 284 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 285 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 286 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 287 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 288 "vpmovm2b %%k1,%%zmm5\n\t" 289 "vpmovm2b %%k2,%%zmm7\n\t" 290 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 291 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 292 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 293 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 294 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 295 "vpxorq %%zmm7,%%zmm6,%%zmm6" 296 : 297 : ); 298 } 299 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" 300 "vpxorq %1,%%zmm6,%%zmm6\n\t" 301 /* Don't use movntdq for r/w 302 * memory area < cache line 303 */ 304 "vmovdqa64 %%zmm4,%0\n\t" 305 "vmovdqa64 %%zmm6,%1\n\t" 306 "vmovdqa64 %%zmm2,%2\n\t" 307 "vmovdqa64 %%zmm3,%3" 308 : 309 : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), 310 "m" (p[d+64])); 311 } 312 313 asm volatile("sfence" : : : "memory"); 314 kernel_fpu_end(); 315} 316 317const struct raid6_calls raid6_avx512x2 = { 318 raid6_avx5122_gen_syndrome, 319 raid6_avx5122_xor_syndrome, 320 raid6_have_avx512, 321 "avx512x2", 322 .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ 323}; 324 325#ifdef CONFIG_X86_64 326 327/* 328 * Unrolled-by-4 AVX2 implementation 329 */ 330static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) 331{ 332 u8 **dptr = (u8 **)ptrs; 333 u8 *p, *q; 334 int d, z, z0; 335 336 z0 = disks - 3; /* Highest data disk */ 337 p = dptr[z0+1]; /* XOR parity */ 338 q = dptr[z0+2]; /* RS syndrome */ 339 340 kernel_fpu_begin(); 341 342 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 343 "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ 344 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ 345 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ 346 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ 347 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ 348 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ 349 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ 350 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ 351 "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ 352 : 353 : "m" (raid6_avx512_constants.x1d[0])); 354 355 for (d = 0; d < bytes; d += 256) { 356 for (z = z0; z >= 0; z--) { 357 asm volatile("prefetchnta %0\n\t" 358 "prefetchnta %1\n\t" 359 "prefetchnta %2\n\t" 360 "prefetchnta %3\n\t" 361 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 362 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" 363 "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" 364 "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" 365 "vpmovm2b %%k1,%%zmm5\n\t" 366 "vpmovm2b %%k2,%%zmm7\n\t" 367 "vpmovm2b %%k3,%%zmm13\n\t" 368 "vpmovm2b %%k4,%%zmm15\n\t" 369 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 370 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 371 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 372 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" 373 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 374 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 375 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 376 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 377 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 378 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 379 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 380 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" 381 "vmovdqa64 %0,%%zmm5\n\t" 382 "vmovdqa64 %1,%%zmm7\n\t" 383 "vmovdqa64 %2,%%zmm13\n\t" 384 "vmovdqa64 %3,%%zmm15\n\t" 385 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 386 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 387 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" 388 "vpxorq %%zmm15,%%zmm11,%%zmm11\n" 389 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 390 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 391 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 392 "vpxorq %%zmm15,%%zmm14,%%zmm14" 393 : 394 : "m" (dptr[z][d]), "m" (dptr[z][d+64]), 395 "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); 396 } 397 asm volatile("vmovntdq %%zmm2,%0\n\t" 398 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" 399 "vmovntdq %%zmm3,%1\n\t" 400 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" 401 "vmovntdq %%zmm10,%2\n\t" 402 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" 403 "vmovntdq %%zmm11,%3\n\t" 404 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" 405 "vmovntdq %%zmm4,%4\n\t" 406 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" 407 "vmovntdq %%zmm6,%5\n\t" 408 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" 409 "vmovntdq %%zmm12,%6\n\t" 410 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" 411 "vmovntdq %%zmm14,%7\n\t" 412 "vpxorq %%zmm14,%%zmm14,%%zmm14" 413 : 414 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 415 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), 416 "m" (q[d+128]), "m" (q[d+192])); 417 } 418 419 asm volatile("sfence" : : : "memory"); 420 kernel_fpu_end(); 421} 422 423static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, 424 size_t bytes, void **ptrs) 425{ 426 u8 **dptr = (u8 **)ptrs; 427 u8 *p, *q; 428 int d, z, z0; 429 430 z0 = stop; /* P/Q right side optimization */ 431 p = dptr[disks-2]; /* XOR parity */ 432 q = dptr[disks-1]; /* RS syndrome */ 433 434 kernel_fpu_begin(); 435 436 asm volatile("vmovdqa64 %0,%%zmm0" 437 :: "m" (raid6_avx512_constants.x1d[0])); 438 439 for (d = 0 ; d < bytes ; d += 256) { 440 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 441 "vmovdqa64 %1,%%zmm6\n\t" 442 "vmovdqa64 %2,%%zmm12\n\t" 443 "vmovdqa64 %3,%%zmm14\n\t" 444 "vmovdqa64 %4,%%zmm2\n\t" 445 "vmovdqa64 %5,%%zmm3\n\t" 446 "vmovdqa64 %6,%%zmm10\n\t" 447 "vmovdqa64 %7,%%zmm11\n\t" 448 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" 449 "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" 450 "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" 451 "vpxorq %%zmm14,%%zmm11,%%zmm11" 452 : 453 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), 454 "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), 455 "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 456 "m" (p[d+192])); 457 /* P/Q data pages */ 458 for (z = z0-1 ; z >= start ; z--) { 459 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 460 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 461 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" 462 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" 463 "prefetchnta %0\n\t" 464 "prefetchnta %2\n\t" 465 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 466 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 467 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" 468 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" 469 "vpmovm2b %%k1,%%zmm5\n\t" 470 "vpmovm2b %%k2,%%zmm7\n\t" 471 "vpmovm2b %%k3,%%zmm13\n\t" 472 "vpmovm2b %%k4,%%zmm15\n\t" 473 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 474 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 475 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 476 "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" 477 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 478 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 479 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 480 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 481 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 482 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 483 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 484 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" 485 "vmovdqa64 %0,%%zmm5\n\t" 486 "vmovdqa64 %1,%%zmm7\n\t" 487 "vmovdqa64 %2,%%zmm13\n\t" 488 "vmovdqa64 %3,%%zmm15\n\t" 489 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 490 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 491 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" 492 "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" 493 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 494 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 495 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 496 "vpxorq %%zmm15,%%zmm14,%%zmm14" 497 : 498 : "m" (dptr[z][d]), "m" (dptr[z][d+64]), 499 "m" (dptr[z][d+128]), 500 "m" (dptr[z][d+192])); 501 } 502 asm volatile("prefetchnta %0\n\t" 503 "prefetchnta %1\n\t" 504 : 505 : "m" (q[d]), "m" (q[d+128])); 506 /* P/Q left side optimization */ 507 for (z = start-1 ; z >= 0 ; z--) { 508 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 509 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 510 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" 511 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" 512 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 513 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 514 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" 515 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" 516 "vpmovm2b %%k1,%%zmm5\n\t" 517 "vpmovm2b %%k2,%%zmm7\n\t" 518 "vpmovm2b %%k3,%%zmm13\n\t" 519 "vpmovm2b %%k4,%%zmm15\n\t" 520 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 521 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 522 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 523 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" 524 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 525 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 526 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 527 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 528 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 529 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 530 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 531 "vpxorq %%zmm15,%%zmm14,%%zmm14" 532 : 533 : ); 534 } 535 asm volatile("vmovntdq %%zmm2,%0\n\t" 536 "vmovntdq %%zmm3,%1\n\t" 537 "vmovntdq %%zmm10,%2\n\t" 538 "vmovntdq %%zmm11,%3\n\t" 539 "vpxorq %4,%%zmm4,%%zmm4\n\t" 540 "vpxorq %5,%%zmm6,%%zmm6\n\t" 541 "vpxorq %6,%%zmm12,%%zmm12\n\t" 542 "vpxorq %7,%%zmm14,%%zmm14\n\t" 543 "vmovntdq %%zmm4,%4\n\t" 544 "vmovntdq %%zmm6,%5\n\t" 545 "vmovntdq %%zmm12,%6\n\t" 546 "vmovntdq %%zmm14,%7" 547 : 548 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 549 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), 550 "m" (q[d+128]), "m" (q[d+192])); 551 } 552 asm volatile("sfence" : : : "memory"); 553 kernel_fpu_end(); 554} 555const struct raid6_calls raid6_avx512x4 = { 556 raid6_avx5124_gen_syndrome, 557 raid6_avx5124_xor_syndrome, 558 raid6_have_avx512, 559 "avx512x4", 560 .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ 561}; 562#endif 563 564#endif /* CONFIG_AS_AVX512 */