cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

copyuser_power7.S (12150B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
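
/*
 * On little-endian, LVS uses lvsr and VPERM swaps its source operands so
 * that the unaligned-copy permute sequence below produces the same result
 * for either byte order.
 */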

	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

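/*
 * Each errN macro places a numbered local label in front of the user
 * access that follows it and records an exception-table entry for that
 * label, so a fault at that instruction branches to the matching
 * .Ldo_errN fixup below.
 */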
#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


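/*
 * __copy_tofrom_user_power7(to=r3, from=r4, n=r5)
 *
 * As with the other __copy_tofrom_user variants, the return value in r3
 * is the number of bytes that could not be copied: the success paths
 * return 0, and the fault fixups above reload the original arguments and
 * fall back to __copy_tofrom_user_base.
 */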
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
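
/*
 * Copies of fewer than 16 bytes took the .Lshort_copy branch above;
 * copies larger than 3328 bytes use the VMX path when CPU_FTR_ALTIVEC is
 * available.  Everything else, and any VMX copy that cannot enter VMX
 * context, uses the GPR loop below.
 */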

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

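	/*
	 * r14-r22 are nonvolatile, hence the stack frame and saves above;
	 * together with r0 and r6-r12 that gives enough scratch GPRs to
	 * move a full 128B cacheline per iteration.
	 */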
	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000   /* addr from */
	dcbt	0,r7,0b01010   /* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000   /* addr to */
	dcbtst	0,r10,0b01010  /* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */

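	/*
	 * If enter_vmx_usercopy() returned 0, VMX cannot be used here:
	 * unwind the stack frame and fall back to the GPR copy loop (the
	 * prefetch streams above are started either way).
	 */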
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

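/*
 * Source and destination have different alignment within a 16B block.
 * The destination is 16B aligned as before, while the source is read
 * with 16B-aligned lvx loads and each pair of adjacent vectors is merged
 * with vperm using the control vector from LVS (v16), so the stores
 * remain aligned.
 */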
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */