cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

NG2memcpy.S (15610B)


/* SPDX-License-Identifier: GPL-2.0 */
/* NG2memcpy.S: Niagara-2 optimized memcpy.
 *
 * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif
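
/* STORE_INIT below goes through this block-init ASI.  On Niagara-class
 * chips an init store to the first doubleword of an L2 line allocates
 * the line without first fetching its old contents from memory, which
 * pays off when the whole line is about to be overwritten; the
 * SIMULATE_NIAGARA_ON_NON_NIAGARA build falls back to plain ASI_P.
 */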

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] 0x80
#endif
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG2memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
	faligndata	%x0, %x1, %f0; \
	faligndata	%x1, %x2, %f2; \
	faligndata	%x2, %x3, %f4; \
	faligndata	%x3, %x4, %f6; \
	faligndata	%x4, %x5, %f8; \
	faligndata	%x5, %x6, %f10; \
	faligndata	%x6, %x7, %f12; \
	faligndata	%x7, %x8, %f14;
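
/* FREG_FROB turns nine consecutive source doublewords (x0..x8) into one
 * realigned 64-byte output block in %f0-%f14: each faligndata extracts
 * eight bytes from a register pair at the byte offset that alignaddr
 * latched into %gsr.
 */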

#define FREG_MOVE_1(x0) \
	fsrc2		%x0, %f0;
#define FREG_MOVE_2(x0, x1) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2;
#define FREG_MOVE_3(x0, x1, x2) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4;
#define FREG_MOVE_4(x0, x1, x2, x3) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6;
#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6; \
	fsrc2		%x4, %f8;
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6; \
	fsrc2		%x4, %f8; \
	fsrc2		%x5, %f10;
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6; \
	fsrc2		%x4, %f8; \
	fsrc2		%x5, %f10; \
	fsrc2		%x6, %f12;
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6; \
	fsrc2		%x4, %f8; \
	fsrc2		%x5, %f10; \
	fsrc2		%x6, %f12; \
	fsrc2		%x7, %f14;
#define FREG_LOAD_1(base, x0) \
	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1)
#define FREG_LOAD_2(base, x0, x1) \
	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1);
#define FREG_LOAD_3(base, x0, x1, x2) \
	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1);
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1);
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1);
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1);
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1); \
	EX_LD_FP(LOAD(ldd, base + 0x30, %x6), NG2_retl_o2_plus_g1);

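/* FREG_LOAD_N preloads the N live doublewords of the first, partially
 * consumed source block so the 1NN loops below can enter FREG_FROB with
 * a full nine-register window.
 */
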
	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
__restore_fp:
	VISExitHalf
__restore_asi:
	retl
	 wr	%g0, ASI_AIUS, %asi
ENTRY(NG2_retl_o2)
	ba,pt	%xcc, __restore_asi
	 mov	%o2, %o0
ENDPROC(NG2_retl_o2)
ENTRY(NG2_retl_o2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 1, %o0
ENDPROC(NG2_retl_o2_plus_1)
ENTRY(NG2_retl_o2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 4, %o0
ENDPROC(NG2_retl_o2_plus_4)
ENTRY(NG2_retl_o2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 8, %o0
ENDPROC(NG2_retl_o2_plus_8)
ENTRY(NG2_retl_o2_plus_o4_plus_1)
	add	%o4, 1, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_1)
ENTRY(NG2_retl_o2_plus_o4_plus_8)
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_8)
ENTRY(NG2_retl_o2_plus_o4_plus_16)
	add	%o4, 16, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_16)
ENTRY(NG2_retl_o2_plus_g1_fp)
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_fp)
ENTRY(NG2_retl_o2_plus_g1_plus_64_fp)
	add	%g1, 64, %g1
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_plus_64_fp)
ENTRY(NG2_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_plus_1)
ENTRY(NG2_retl_o2_and_7_plus_o4)
	and	%o2, 7, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_and_7_plus_o4)
ENTRY(NG2_retl_o2_and_7_plus_o4_plus_8)
	and	%o2, 7, %o2
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_and_7_plus_o4_plus_8)
#endif
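
/* The NG2_retl_* stubs above are the fault fixup targets that the
 * NG2copy_{from,to}_user wrappers attach through EX_LD/EX_ST: each one
 * restores %asi (or the FP state) and hands back in %o0 the number of
 * bytes still uncopied, rebuilt from the loop counters (%o2 plus the
 * adjustment named in the label) at the faulting instruction.
 */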

	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o3
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, GLOBAL_SPARE
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		GLOBAL_SPARE, %o2, GLOBAL_SPARE

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 *
	 * However, the cut-off point, performance wise, is around
	 * 4 64-byte blocks.
	 */
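	/* Worked example: len = 256 with dst one byte past a 64-byte
	 * boundary.  The alignment loop below copies 63 bytes, leaving
	 * 193; of those, %g1 = 192 feeds three iterations of the block
	 * copy loop and the final %o2 = 1 byte is mopped up after
	 * label 195.
	 */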
	cmp		%o2, (4 * 64)
	blu,pt		%XCC, 75f
	 andcc		GLOBAL_SPARE, 0x7, %g0

	/* %o0:	dst
	 * %o1:	src
	 * %o2:	len  (known to be >= 128)
	 *
	 * The block copy loops can use %o4, %g2, %g3 as
	 * temporaries while copying the data.  %o5 must
	 * be preserved between VISEntryHalf and VISExitHalf
	 */

	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	LOAD(prefetch, %o1 + 0x080, #one_read)

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_o4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG2_retl_o2_plus_o4_plus_1)
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	add		%o0, 1, %o0

2:
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	membar		#Sync
	alignaddr	%o1, %g0, %g0

	add		%o1, (64 - 1), %o4
	andn		%o4, (64 - 1), %o4
	andn		%o2, (64 - 1), %g1
	sub		%o2, %g1, %o2

	and		%o1, (64 - 1), %g2
	add		%o1, %g1, %o1
	sub		%o0, %o4, %g3
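
	/* Dispatch on %g2 = src & 63: zero means the source is now
	 * 64-byte aligned as well and we take the loop at 190;
	 * otherwise each 8-byte bucket of the offset gets its own loop:
	 * 1-7 -> 110, 8-15 -> 120, 16-23 -> 130, 24-31 -> 140,
	 * 32-39 -> 150, 40-47 -> 160, 48-55 -> 170, 56-63 -> 180.
	 */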
	brz,pt		%g2, 190f
	 cmp		%g2, 32
	blu,a		5f
	 cmp		%g2, 16
	cmp		%g2, 48
	blu,a		4f
	 cmp		%g2, 40
	cmp		%g2, 56
	blu		170f
	 nop
	ba,a,pt		%xcc, 180f
	 nop

4:	/* 32 <= low bits < 48 */
	blu		150f
	 nop
	ba,a,pt		%xcc, 160f
	 nop
5:	/* 0 < low bits < 32 */
	blu,a		6f
	 cmp		%g2, 8
	cmp		%g2, 24
	blu		130f
	 nop
	ba,a,pt		%xcc, 140f
	 nop
6:	/* 0 < low bits < 16 */
	bgeu		120f
	 nop
	/* fall through for 0 < low bits < 8 */
110:	sub		%o4, 64, %g2
	EX_LD_FP(LOAD_BLK(%g2, %f0), NG2_retl_o2_plus_g1)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

120:	sub		%o4, 56, %g2
	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

130:	sub		%o4, 48, %g2
	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

140:	sub		%o4, 40, %g2
	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_5(f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

150:	sub		%o4, 32, %g2
	FREG_LOAD_4(%g2, f0, f2, f4, f6)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_4(f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

160:	sub		%o4, 24, %g2
	FREG_LOAD_3(%g2, f0, f2, f4)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_3(f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

170:	sub		%o4, 16, %g2
	FREG_LOAD_2(%g2, f0, f2)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_2(f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

180:	sub		%o4, 8, %g2
	FREG_LOAD_1(%g2, f0)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_1(f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

190:
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	subcc		%g1, 64, %g1
	EX_LD_FP(LOAD_BLK(%o4, %f0), NG2_retl_o2_plus_g1_plus_64)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1_plus_64)
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)

195:
	add		%o4, %g3, %o0
	membar		#Sync

	VISExitHalf

	/* %o2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	brz,pt		%o2, 85f
	 sub		%o0, %o1, GLOBAL_SPARE
	ba,a,pt		%XCC, 90f
	 nop

	.align		64
75: /* 16 < len <= 64 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, GLOBAL_SPARE

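	/* GLOBAL_SPARE now holds dst - src, so the loops below advance
	 * only %o1 and reach the destination as %o1 + GLOBAL_SPARE.
	 */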
72:
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_o4_plus_16)
	add		%o1, 0x08, %o1
	EX_LD(LOAD(ldx, %o1, %g1), NG2_retl_o2_plus_o4_plus_16)
	sub		%o1, 0x08, %o1
	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_16)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_8)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_8)
	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5), NG2_retl_o2_plus_4)
	EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5), NG2_retl_o2_plus_g1_plus_1)
	EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, GLOBAL_SPARE, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

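	/* At 8 below, source and destination disagree on 8-byte
	 * alignment: read aligned doublewords from src and splice each
	 * output word together with sllx/srlx, where %g1 is the source
	 * misalignment in bits and GLOBAL_SPARE its complement, 64 - %g1.
	 */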
8:	mov		64, GLOBAL_SPARE
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), NG2_retl_o2)
	sub		GLOBAL_SPARE, %g1, GLOBAL_SPARE
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
1:	add		%o1, 0x8, %o1
	EX_LD(LOAD(ldx, %o1, %g3), NG2_retl_o2_and_7_plus_o4)
	subcc		%o4, 0x8, %o4
	srlx		%g3, GLOBAL_SPARE, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), NG2_retl_o2_and_7_plus_o4_plus_8)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, GLOBAL_SPARE

	.align		64
80: /* 0 < len <= 16 */
	andcc		GLOBAL_SPARE, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, GLOBAL_SPARE

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1), NG2_retl_o2_plus_4)
	EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o3), %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_1)
	EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o3), %o0

	.size		FUNC_NAME, .-FUNC_NAME