cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

memcpy_power7.S (10214B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

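/*
 * LVS/VPERM hide the endian difference in the unaligned VMX copy:
 * on big endian lvsl builds the permute control vector and vperm
 * takes (prev, next); on little endian lvsr is used and the operands
 * are swapped so bytes still land in memory order.
 */

/*
 * Entry: r3 = dest, r4 = src, r5 = len. memcpy() must return dest,
 * so r3 is saved below the stack pointer here and reloaded on every
 * return path.
 */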
_GLOBAL(memcpy_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,4096
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
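/*
 * The feature section is patched at boot: the branch to .Lvmx_copy
 * (taken for copies > 4kB, tested into cr1 above) only survives on
 * CPUs with CPU_FTR_ALTIVEC. When this file is built as a userspace
 * selftest, SELFTEST_CASE selects the path at compile time instead.
 */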

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
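	/*
	 * -src mod 8 is the byte count to the next 8B boundary; its low
	 * bits sit in cr7 after the mtocrf 0x01, so each bf below tests
	 * one bit: cr7*4+3 = 1B, +2 = 2B, +1 = 4B. E.g. src = 0x1003
	 * needs 5 = 0b101 fixup bytes: copy 1B, skip 2B, copy 4B.
	 */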

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6
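	/* ctr = r5 / 128: number of whole cachelines to move */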

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
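	/*
	 * cr7 now holds len/16: bit cr7*4+1 selects a 64B block,
	 * +2 a 32B block, +3 a 16B block; the last 15B fall through
	 * to .Lshort_copy.
	 */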

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
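	/* cr7 = low 4 bits of len: +0 = 8B, +1 = 4B, +2 = 2B, +3 = 1B */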
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_ops
	cmpwi	cr1,r3,0
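	/*
	 * enter_vmx_ops() returns 0 if VMX cannot be used here; the
	 * result is kept in cr1 and tested below, after the prefetch
	 * streams have been kicked off, so a failed enable falls back
	 * to the integer copy loop.
	 */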
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

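	/*
	 * Each stream is programmed with two touches: TH=0b01000 supplies
	 * the start address, TH=0b01010 the unit count and depth. The
	 * eieio orders those descriptors ahead of the final TH=0b01010
	 * touch with GO set, which starts all programmed streams.
	 */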
	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
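	/* (src ^ dest) & 0xf == 0: both can reach 16B alignment together */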

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

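	/*
	 * exit_vmx_ops() receives dest in r3 and returns it, so the
	 * tail call below also serves as the function return.
	 */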
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

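	/*
	 * Software-pipelined misaligned copy: quadwords are loaded
	 * 16B-aligned (lvx ignores the low 4 address bits) and VPERM
	 * stitches each pair into an aligned store. v0 carries the
	 * previous quadword between steps, so r4 runs 16B ahead;
	 * label 11 below unwinds that before the residual copy.
	 */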
	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */