xor_avx.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif
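/*
 * Usage sketch (appended for illustration, not part of the header above):
 * a caller in the style of <asm/xor.h> benchmarks the candidate templates
 * with xor_speed() and then picks one, falling back to FASTEST when the CPU
 * lacks AVX or OSXSAVE.  The XOR_TRY_TEMPLATES and XOR_SELECT_TEMPLATE hook
 * names and the xor_block_8regs fallback are assumptions borrowed from that
 * pattern; only AVX_XOR_SPEED, AVX_SELECT and xor_block_avx come from this
 * file.
 */
#if 0	/* illustrative only */
#define XOR_TRY_TEMPLATES			\
do {						\
	AVX_XOR_SPEED;				\
	xor_speed(&xor_block_8regs);		\
} while (0)

/* Prefer the AVX template whenever AVX and OSXSAVE are both advertised. */
#define XOR_SELECT_TEMPLATE(FASTEST)		\
	AVX_SELECT(FASTEST)
#endif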