cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

U1memcpy.S (17018B)


/* SPDX-License-Identifier: GPL-2.0 */
/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
 *
 * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
 * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#include <asm/export.h>
#define GLOBAL_SPARE	g7
#else
#define GLOBAL_SPARE	g5
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif
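/* FREG_FROB feeds a sliding window of nine FP registers through faligndata,
 * leaving one 64-byte block of aligned data in %f48-%f62, ready for the
 * block store issued by the loop macros below.
 */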
#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)		\
	faligndata		%f1, %f2, %f48;			\
	faligndata		%f2, %f3, %f50;			\
	faligndata		%f3, %f4, %f52;			\
	faligndata		%f4, %f5, %f54;			\
	faligndata		%f5, %f6, %f56;			\
	faligndata		%f6, %f7, %f58;			\
	faligndata		%f7, %f8, %f60;			\
	faligndata		%f8, %f9, %f62;
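/* MAIN_LOOP_CHUNK loads the next 64-byte source block into the fdest
 * register group while block-storing the data just assembled in the fsrc
 * group (always %f48-%f62 here), advances both pointers, and branches to
 * jmptgt once the remaining block count in %GLOBAL_SPARE is exhausted.
 * LOOP_CHUNK1/2/3 instantiate it for the %f0, %f16 and %f32 groups.
 */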
#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, jmptgt)			\
	EX_LD_FP(LOAD_BLK(%src, %fdest), U1_gs_80_fp);			\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);			\
	add			%src, 0x40, %src;			\
	subcc			%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE;	\
	be,pn			%xcc, jmptgt;				\
	 add			%dest, 0x40, %dest;			\

#define LOOP_CHUNK1(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f0,  f48, branch_dest)
#define LOOP_CHUNK2(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f16, f48, branch_dest)
#define LOOP_CHUNK3(src, dest, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f32, f48, branch_dest)
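/* STORE_SYNC block-stores the current %f48 group and drains it with a
 * membar #Sync; STORE_JUMP issues the same store but then branches to one
 * of the 40f-63f finishing chunks instead of synchronizing.
 */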
#define DO_SYNC			membar	#Sync;
#define STORE_SYNC(dest, fsrc)				\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
	add			%dest, 0x40, %dest;	\
	DO_SYNC

#define STORE_JUMP(dest, fsrc, target)			\
	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_40_fp);	\
	add			%dest, 0x40, %dest;	\
	ba,pt			%xcc, target;		\
	 nop;
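/* FINISH_VISCHUNK stores one more aligned doubleword per invocation,
 * bailing out to the byte-copy tail at 95f once %g3 drops below 8.
 * UNEVEN_VISCHUNK(_LAST) performs the same %g3 check but only moves the
 * wrap-around register with fsrc2, handing off to the doubleword loop
 * at 93f.
 */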
#define FINISH_VISCHUNK(dest, f0, f1)			\
	subcc			%g3, 8, %g3;		\
	bl,pn			%xcc, 95f;		\
	 faligndata		%f0, %f1, %f48;		\
	EX_ST_FP(STORE(std, %f48, %dest), U1_g3_8_fp);	\
	add			%dest, 8, %dest;

#define UNEVEN_VISCHUNK_LAST(dest, f0, f1)	\
	subcc			%g3, 8, %g3;	\
	bl,pn			%xcc, 95f;	\
	 fsrc2			%f0, %f1;

#define UNEVEN_VISCHUNK(dest, f0, f1)		\
	UNEVEN_VISCHUNK_LAST(dest, f0, f1)	\
	ba,a,pt			%xcc, 93f;

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
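/* Exception continuation stubs, assembled only when EX_RETVAL is not
 * pre-defined (i.e. the plain memcpy build).  They are the fixup targets
 * named as the second argument of EX_LD/EX_ST; wrappers such as the
 * U1copy_{from,to}_user variants that include this file are expected to
 * route faults here through their exception tables.  Each stub
 * reconstructs the residual byte count from the register and fixed
 * adjustment encoded in its name plus the outstanding %o2, and returns it
 * in %o0; the _fp variants leave the VIS state via VISExitHalf first.
 */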
ENTRY(U1_g1_1_fp)
	VISExitHalf
	add		%g1, 1, %g1
	add		%g1, %g2, %g1
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_1_fp)
ENTRY(U1_g2_0_fp)
	VISExitHalf
	retl
	 add		%g2, %o2, %o0
ENDPROC(U1_g2_0_fp)
ENTRY(U1_g2_8_fp)
	VISExitHalf
	add		%g2, 8, %g2
	retl
	 add		%g2, %o2, %o0
ENDPROC(U1_g2_8_fp)
ENTRY(U1_gs_0_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_0_fp)
ENTRY(U1_gs_80_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_80_fp)
ENTRY(U1_gs_40_fp)
	VISExitHalf
	add		%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE
	add		%GLOBAL_SPARE, %g3, %o0
	retl
	 add		%o0, %o2, %o0
ENDPROC(U1_gs_40_fp)
ENTRY(U1_g3_0_fp)
	VISExitHalf
	retl
	 add		%g3, %o2, %o0
ENDPROC(U1_g3_0_fp)
ENTRY(U1_g3_8_fp)
	VISExitHalf
	add		%g3, 8, %g3
	retl
	 add		%g3, %o2, %o0
ENDPROC(U1_g3_8_fp)
ENTRY(U1_o2_0_fp)
	VISExitHalf
	retl
	 mov		%o2, %o0
ENDPROC(U1_o2_0_fp)
ENTRY(U1_o2_1_fp)
	VISExitHalf
	retl
	 add		%o2, 1, %o0
ENDPROC(U1_o2_1_fp)
ENTRY(U1_gs_0)
	VISExitHalf
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_0)
ENTRY(U1_gs_8)
	VISExitHalf
	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, 0x8, %o0
ENDPROC(U1_gs_8)
ENTRY(U1_gs_10)
	VISExitHalf
	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, 0x10, %o0
ENDPROC(U1_gs_10)
ENTRY(U1_o2_0)
	retl
	 mov		%o2, %o0
ENDPROC(U1_o2_0)
ENTRY(U1_o2_8)
	retl
	 add		%o2, 8, %o0
ENDPROC(U1_o2_8)
ENTRY(U1_o2_4)
	retl
	 add		%o2, 4, %o0
ENDPROC(U1_o2_4)
ENTRY(U1_o2_1)
	retl
	 add		%o2, 1, %o0
ENDPROC(U1_o2_1)
ENTRY(U1_g1_0)
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_0)
ENTRY(U1_g1_1)
	add		%g1, 1, %g1
	retl
	 add		%g1, %o2, %o0
ENDPROC(U1_g1_1)
ENTRY(U1_gs_0_o2_adj)
	and		%o2, 7, %o2
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_0_o2_adj)
ENTRY(U1_gs_8_o2_adj)
	and		%o2, 7, %o2
	add		%GLOBAL_SPARE, 8, %GLOBAL_SPARE
	retl
	 add		%GLOBAL_SPARE, %o2, %o0
ENDPROC(U1_gs_8_o2_adj)
#endif

	.align		64

	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
FUNC_NAME:		/* %o0=dst, %o1=src, %o2=len */
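	/* Trap (software trap 5) if any bit at or above bit 31 of the
	 * length is set, i.e. len >= 2 GiB; presumably a sanity check
	 * against negative or overflowed sizes.
	 */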
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	cmp		%o2, (5 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  */
	VISEntry
	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
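	/* Equivalently: pad = 0x40 - (dst & 0x3f); len -= pad; the low
	 * three bits of pad (%g1) are copied one byte at a time in the
	 * loop at 1f, the remaining pad & 0x38 bytes (%g2) eight at a time
	 * by the faligndata loop following 2f.  E.g. dst & 0x3f == 0x0c
	 * gives pad = 0x34: 4 byte copies, then 0x30 bytes of doubleword
	 * copies.
	 */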
	 sub		%o0, %o1, %GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U1_g1_1_fp)
	EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE), U1_g1_1_fp)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, %GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U1_g2_0_fp)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U1_g2_0_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U1_g2_0_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

	/* Destination is 64-byte aligned.  */
3:
	membar		  #LoadStore | #StoreStore | #StoreLoad

	subcc		%o2, 0x40, %GLOBAL_SPARE
	add		%o1, %g1, %g1
	andncc		%GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
	srl		%g1, 3, %g2
	sub		%o2, %GLOBAL_SPARE, %g3
	andn		%o1, (0x40 - 1), %o1
	and		%g2, 7, %g2
	andncc		%g3, 0x7, %g3
	fsrc2		%f0, %f2
	sub		%g3, 0x8, %g3
	sub		%o2, %GLOBAL_SPARE, %o2

	add		%g1, %GLOBAL_SPARE, %g1
	subcc		%o2, %g3, %o2

	EX_LD_FP(LOAD_BLK(%o1, %f0), U1_gs_0_fp)
	add		%o1, 0x40, %o1
	add		%g1, %g3, %g1
	EX_LD_FP(LOAD_BLK(%o1, %f16), U1_gs_0_fp)
	add		%o1, 0x40, %o1
	sub		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	EX_LD_FP(LOAD_BLK(%o1, %f32), U1_gs_80_fp)
	add		%o1, 0x40, %o1

	/* There are 8 instances of the unrolled loop,
	 * one for each possible alignment of the
	 * source buffer.  Each loop instance is 452
	 * bytes.
	 */
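	/* The shift/add sequence below forms %g2 * 452 without a multiply
	 * (n*8 - n = 7n, 7n*16 = 112n, 112n + n = 113n, 113n*4 = 452n);
	 * the rd %pc / jmpl pair then dispatches into the loop variant
	 * matching the source doubleword alignment (0-7) held in %g2.
	 */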
	sll		%g2, 3, %o3
	sub		%o3, %g2, %o3
	sllx		%o3, 4, %o3
	add		%o3, %g2, %o3
	sllx		%o3, 2, %g2
1:	rd		%pc, %o3
	add		%o3, %lo(1f - 1b), %o3
	jmpl		%o3 + %g2, %g0
	 nop

	.align		64
1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f0, %f2, %f48
1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_SYNC(o0, f48)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_JUMP(o0, f48, 40f)
2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_SYNC(o0, f48)
	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_JUMP(o0, f48, 48f)
3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_SYNC(o0, f48)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_JUMP(o0, f48, 56f)

1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f2, %f4, %f48
1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_SYNC(o0, f48)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_JUMP(o0, f48, 41f)
2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_SYNC(o0, f48)
	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_JUMP(o0, f48, 49f)
3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_SYNC(o0, f48)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_JUMP(o0, f48, 57f)

1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f4, %f6, %f48
1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_SYNC(o0, f48)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_JUMP(o0, f48, 42f)
2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_SYNC(o0, f48)
	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_JUMP(o0, f48, 50f)
3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_SYNC(o0, f48)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_JUMP(o0, f48, 58f)

1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f6, %f8, %f48
1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_SYNC(o0, f48)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_JUMP(o0, f48, 43f)
2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_SYNC(o0, f48)
	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_JUMP(o0, f48, 51f)
3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_SYNC(o0, f48)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_JUMP(o0, f48, 59f)

1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f8, %f10, %f48
1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_SYNC(o0, f48)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_JUMP(o0, f48, 44f)
2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_SYNC(o0, f48)
	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_JUMP(o0, f48, 52f)
3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_SYNC(o0, f48)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_JUMP(o0, f48, 60f)

1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f10, %f12, %f48
1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_SYNC(o0, f48)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_JUMP(o0, f48, 45f)
2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_SYNC(o0, f48)
	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_JUMP(o0, f48, 53f)
3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_SYNC(o0, f48)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_JUMP(o0, f48, 61f)

1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f12, %f14, %f48
1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_SYNC(o0, f48)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_JUMP(o0, f48, 46f)
2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_SYNC(o0, f48)
	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_JUMP(o0, f48, 54f)
3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_SYNC(o0, f48)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_JUMP(o0, f48, 62f)

1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	LOOP_CHUNK1(o1, o0, 1f)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	LOOP_CHUNK2(o1, o0, 2f)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	LOOP_CHUNK3(o1, o0, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f14, %f16, %f48
1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_SYNC(o0, f48)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_JUMP(o0, f48, 47f)
2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_SYNC(o0, f48)
	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_JUMP(o0, f48, 55f)
3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_SYNC(o0, f48)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_JUMP(o0, f48, 63f)

40:	FINISH_VISCHUNK(o0, f0,  f2)
41:	FINISH_VISCHUNK(o0, f2,  f4)
42:	FINISH_VISCHUNK(o0, f4,  f6)
43:	FINISH_VISCHUNK(o0, f6,  f8)
44:	FINISH_VISCHUNK(o0, f8,  f10)
45:	FINISH_VISCHUNK(o0, f10, f12)
46:	FINISH_VISCHUNK(o0, f12, f14)
47:	UNEVEN_VISCHUNK(o0, f14, f0)
48:	FINISH_VISCHUNK(o0, f16, f18)
49:	FINISH_VISCHUNK(o0, f18, f20)
50:	FINISH_VISCHUNK(o0, f20, f22)
51:	FINISH_VISCHUNK(o0, f22, f24)
52:	FINISH_VISCHUNK(o0, f24, f26)
53:	FINISH_VISCHUNK(o0, f26, f28)
54:	FINISH_VISCHUNK(o0, f28, f30)
55:	UNEVEN_VISCHUNK(o0, f30, f0)
56:	FINISH_VISCHUNK(o0, f32, f34)
57:	FINISH_VISCHUNK(o0, f34, f36)
58:	FINISH_VISCHUNK(o0, f36, f38)
59:	FINISH_VISCHUNK(o0, f38, f40)
60:	FINISH_VISCHUNK(o0, f40, f42)
61:	FINISH_VISCHUNK(o0, f42, f44)
62:	FINISH_VISCHUNK(o0, f44, f46)
63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0)

93:	EX_LD_FP(LOAD(ldd, %o1, %f2), U1_g3_0_fp)
	add		%o1, 8, %o1
	subcc		%g3, 8, %g3
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
	bl,pn		%xcc, 95f
	 add		%o0, 8, %o0
	EX_LD_FP(LOAD(ldd, %o1, %f0), U1_g3_0_fp)
	add		%o1, 8, %o1
	subcc		%g3, 8, %g3
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
	bge,pt		%xcc, 93b
	 add		%o0, 8, %o0

95:	brz,pt		%o2, 2f
	 mov		%g1, %o1

1:	EX_LD_FP(LOAD(ldub, %o1, %o3), U1_o2_0_fp)
	add		%o1, 1, %o1
	subcc		%o2, 1, %o2
	EX_ST_FP(STORE(stb, %o3, %o0), U1_o2_1_fp)
	bne,pt		%xcc, 1b
	 add		%o0, 1, %o0

2:	membar		#StoreLoad | #StoreStore
	VISExit
	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		64
70:	/* 16 < len <= (5 * 64) */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:	andn		%o2, 0xf, %GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U1_gs_0)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U1_gs_0)
	subcc		%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_gs_10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U1_gs_8)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U1_o2_0)
	sub		%o2, 0x8, %o2
	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_o2_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U1_o2_0)
	sub		%o2, 0x4, %o2
	EX_ST(STORE(stw, %o5, %o1 + %o3), U1_o2_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1, %o5), U1_g1_0)
	subcc		%g1, 1, %g1
	EX_ST(STORE(stb, %o5, %o1 + %o3), U1_g1_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b
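	/* Destination is now 8-byte aligned but the source is not: copy a
	 * doubleword per iteration by loading aligned source words and
	 * merging adjacent pairs with shifts (%g1 = source misalignment in
	 * bits, %o3 = 64 - %g1), so this mid-size path never touches the
	 * VIS unit.
	 */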
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U1_o2_0)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U1_gs_0_o2_adj)
	subcc		%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U1_gs_8_o2_adj)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80:	/* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:	EX_LD(LOAD(lduw, %o1, %g1), U1_o2_0)
	subcc		%o2, 4, %o2
	EX_ST(STORE(stw, %g1, %o1 + %o3), U1_o2_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:	EX_LD(LOAD(ldub, %o1, %g1), U1_o2_0)
	subcc		%o2, 1, %o2
	EX_ST(STORE(stb, %g1, %o1 + %o3), U1_o2_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME
EXPORT_SYMBOL(FUNC_NAME)