cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

avx2.c (17279B)


// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};
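
/*
 * Background sketch (not from the upstream sources): the Q syndrome is
 * computed over GF(2^8) with the reduction polynomial x^8+x^4+x^3+x^2+1
 * (0x11d), so multiplying a byte by 2 is "shift left, then XOR 0x1d if the
 * top bit was set".  Roughly, with gf_mul2 being an illustrative name only:
 *
 *	static inline u8 gf_mul2(u8 v)
 *	{
 *		return (v << 1) ^ ((v & 0x80) ? 0x1d : 0);
 *	}
 *
 * The vpcmpgtb/vpaddb/vpand/vpxor sequences in this file do the same for
 * 32 bytes per register: vpcmpgtb against zero yields 0xff in each lane
 * whose top bit is set, vpand with ymm0 turns that mask into 0x1d, and
 * vpxor folds it into the doubled value.
 */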

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
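/*
 * Sketch of what gen_syndrome computes (scalar form, per byte; gf_mul2 as
 * in the note above): P is the plain XOR of all data disks and Q follows
 * Horner's rule from the highest data disk down:
 *
 *	p = q = d[z0];
 *	for (z = z0 - 1; z >= 0; z--) {
 *		p ^= d[z];
 *		q = gf_mul2(q) ^ d[z];
 *	}
 *
 * The AVX2 loop below keeps P in ymm2 and Q in ymm4, 32 bytes at a time,
 * and writes the results with non-temporal stores (vmovntdq) so P/Q do not
 * displace the data pages from the cache.
 */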
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
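
/*
 * Sketch of the xor_syndrome variants (scalar form; gf_mul2 as above): they
 * XOR the P/Q contribution of data disks start..stop into an existing P/Q
 * pair.  Disks above stop and below start contribute no data, but Q still
 * needs one multiply by 2 per skipped low disk so the powers of 2 stay
 * aligned:
 *
 *	p ^= d[stop];
 *	dq = d[stop];
 *	for (z = stop - 1; z >= start; z--) {	// "right side": real data
 *		dq = gf_mul2(dq) ^ d[z];
 *		p ^= d[z];
 *	}
 *	for (z = start - 1; z >= 0; z--)	// "left side": shift only
 *		dq = gf_mul2(dq);
 *	q ^= dq;
 */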

static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};

/*
 * Unrolled-by-2 AVX2 implementation
 */
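/*
 * Note: the unrolled variants run the same per-byte arithmetic on two
 * (avx2x2) or four (avx2x4) independent 32-byte lanes per iteration, which
 * is presumably meant to hide the latency of the multiply-by-2 dependency
 * chain behind additional loads and XORs.
 */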
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
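/*
 * Note: the x4 variant keeps four P/Q pairs live at once and therefore uses
 * ymm8-ymm15, which are only available in 64-bit mode; hence the
 * CONFIG_X86_64 guard around this block.
 */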
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};
#endif /* CONFIG_X86_64 */