cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

xor.h (10640B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

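/*
 * Building blocks for the unrolled loops below: OFFS(x) is a 16-byte (one
 * XMM register) offset into the current 256-byte chunk, and PF_OFFS(x) is
 * the same offset one chunk (256 bytes) ahead.  LD/ST move 16 bytes between
 * the destination buffer p1 and an XMM register, XO1..XO4 XOR data from the
 * source buffers p2..p5 into a register, and PF0..PF4 issue non-temporal
 * prefetches (prefetchnta) on p1..p5 so the streamed data does not displace
 * useful cache lines.  Each BLOCK() expansion covers 64 bytes in four XMM
 * registers; four BLOCKs per loop iteration give the 256 bytes per pass
 * that "bytes >> 8" accounts for.
 */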
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)

static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
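
/*
 * For reference only: a plain-C sketch of the operation xor_sse_2() above
 * performs, assuming 'bytes' is a multiple of the 256 bytes each loop
 * iteration handles (typically PAGE_SIZE).  The helper name xor_ref_2 is
 * purely illustrative and not part of the kernel's XOR API; the SSE
 * versions do the same work in XMM registers with non-temporal prefetches
 * to keep the streamed buffers from polluting the cache.
 */
static inline void
xor_ref_2(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2)
{
	unsigned long i;

	/* XOR each machine word of the source p2 into the destination p1 */
	for (i = 0; i < bytes / sizeof(unsigned long); i++)
		p1[i] ^= p2[i];
}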

static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4,
	  const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4,
	       const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
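
/*
 * The do_2..do_5 hooks above take a byte count (consumed in 256-byte chunks)
 * followed by the destination buffer and one to four source buffers, and are
 * invoked through the generic XOR template machinery (e.g. xor_blocks()).
 * Purely as an illustration of the calling convention:
 *
 *	xor_block_sse_pf64.do_3(PAGE_SIZE, dest, src1, src2);
 *
 * would XOR src1 and src2 into dest over one page.
 */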

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)
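
/*
 * XOR_SELECT_TEMPLATE is consulted by the generic XOR calibration code once
 * the available templates have been benchmarked; AVX_SELECT, pulled in via
 * the xor_32.h/xor_64.h includes above, is expected to prefer the AVX
 * implementation when the CPU supports it and otherwise fall back to the
 * fastest template found.
 */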

#endif /* _ASM_X86_XOR_H */